libera 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +9 -0
- data/lib/libera/tei.rb +35 -1
- data/lib/libera/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a48b962889918bed974055b7c00258ab1d546826
|
4
|
+
data.tar.gz: f70e99edf393c7fb1f534ce878f1e79279f6ab56
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00aa349e53c83c32b84c901d48d37ace7472f095fd65c0b63fae43a68c7c700bbc45141e7b3617cc51c54db493a109daf694f15e5ef2b6e2d6e78299fe65ff76
|
7
|
+
data.tar.gz: c336a4d25119edcf30c5eba034b7c875656b1d626445ab354e125e289b1e532ea3e202cefc638c76aac77bcfbcca46a52603d6c571ee1d929eab8716b1dc4734
|
data/README.md
CHANGED
@@ -6,6 +6,15 @@ Its purpose is to take PDF files as input, and split them apart into individual
|
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
9
|
+
The following programs are required for Libera to work:
|
10
|
+
|
11
|
+
* Tesseract - https://github.com/tesseract-ocr/tesseract
|
12
|
+
* ImageMagick - https://www.imagemagick.org
|
13
|
+
|
14
|
+
Tesseract 3.03 and ImageMagick 6.7.7-10 were the versions used in the development of this gem.
|
15
|
+
|
16
|
+
Both should be available through package managers such as APT, Yum, or Homebrew.
|
17
|
+
|
9
18
|
Add this line to your application's Gemfile:
|
10
19
|
|
11
20
|
```ruby
|
data/lib/libera/tei.rb
CHANGED
@@ -5,7 +5,7 @@ module Libera
|
|
5
5
|
include OM::XML::Document
|
6
6
|
|
7
7
|
set_terminology do |t|
|
8
|
-
t.root(:path => '
|
8
|
+
t.root(:path => 'TEI', :xmlns => 'http://www.tei-c.org/ns/1.0', :namespace_prefix => nil)
|
9
9
|
t.text(path: 'text'){
|
10
10
|
t.body(path: 'body'){
|
11
11
|
t.page_break(path: 'pb')
|
@@ -14,6 +14,14 @@ module Libera
|
|
14
14
|
}
|
15
15
|
end
|
16
16
|
|
17
|
+
define_template :text do |xml|
|
18
|
+
xml.text_
|
19
|
+
end
|
20
|
+
|
21
|
+
define_template :body do |xml|
|
22
|
+
xml.body
|
23
|
+
end
|
24
|
+
|
17
25
|
define_template :page_break do |xml, img_src|
|
18
26
|
xml.pb(:facs => img_src)
|
19
27
|
end
|
@@ -51,7 +59,33 @@ module Libera
|
|
51
59
|
return builder.doc
|
52
60
|
end
|
53
61
|
|
62
|
+
def add_text
|
63
|
+
begin
|
64
|
+
self.template_registry.add_child(self.ng_xml.root, :text)
|
65
|
+
rescue NoMethodError
|
66
|
+
raise "Unable to add XML node to base template"
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def add_body
|
71
|
+
begin
|
72
|
+
self.template_registry.add_child(self.find_by_terms(:text => 0), :body)
|
73
|
+
rescue NoMethodError
|
74
|
+
raise "Unable to add XML node to base template"
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
54
78
|
def add_page_break(page_img)
|
79
|
+
# any text?
|
80
|
+
if self.find_by_terms(:text => 0).blank?
|
81
|
+
self.add_text
|
82
|
+
end
|
83
|
+
|
84
|
+
# any body?
|
85
|
+
if self.find_by_terms(:text, :body => 0).blank?
|
86
|
+
self.add_body
|
87
|
+
end
|
88
|
+
|
55
89
|
# any anon breaks?
|
56
90
|
ab_count = self.find_by_terms(:text, :body, :anon_block).count
|
57
91
|
|
data/lib/libera/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: libera
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Cliff
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-06-
|
11
|
+
date: 2018-06-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|