libera 1.0.0 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: cc099d4ec4b7e794ee9fcd8dbd057a1ccd04467c
4
- data.tar.gz: 42e414654cab1579bdaad82e5fe6d70f97b0fd7a
2
+ SHA256:
3
+ metadata.gz: d4a0c59927db8d8362d9d53ba4658cf0eeddb6034a8dd7122602aadc84187bdc
4
+ data.tar.gz: 75252c43da0a14a65c1403c32ab572d333860dc847666b79c8b6dd3f0e5fe1ba
5
5
  SHA512:
6
- metadata.gz: 5e4e80f572b7e2bc9ac1febedd14693a956d818aa0ec999d486e11f74813df1029a2eaeed846c36d4fb7951d57c2c0981e50b6a9393c1daaa86b3114a504dd3f
7
- data.tar.gz: d7bb316456adfd50f29e19c28eee08051c6bc2439ac41b98a6f550ee37031528ad89724225e20f6b5a424ea12c208dd9f0f9ada4ec035aff6e4999dc13c986c4
6
+ metadata.gz: 2323ec7e6ac8ff52c16f5b92f6b4787388ff8a38f27ae29061fc721078ea11a82871bab031c7e6136d9ede5c1aab0c3394f2ab499be7a065dabf4bbc6dc697ea
7
+ data.tar.gz: 709c1e1e7f864e149846cc5458009a9204b0f34abd57be20a8d168b991f8dbbf5145f00b7aa659800d273585f7185badb1807806f0ae9109477f5a96f16652bf
@@ -0,0 +1 @@
1
+ ruby-2.5.8
@@ -0,0 +1,15 @@
1
+ FROM ruby:2.5
2
+ RUN apt-get update -qq && apt-get install -y \
3
+ ghostscript \
4
+ libgs-dev \
5
+ imagemagick \
6
+ tesseract-ocr
7
+ RUN rm -f /usr/local/lib/ruby/gems/2.5.0/specifications/default/fileutils-1.0.2.gemspec
8
+ RUN useradd -ms /bin/bash libera
9
+ USER libera
10
+ RUN mkdir -p /home/libera/source
11
+ WORKDIR /home/libera/source
12
+ COPY --chown=libera:libera . /home/libera/source
13
+ RUN /home/libera/source/bin/setup
14
+ RUN gem build libera.gemspec
15
+ RUN gem install libera-1.0.4.gem
@@ -1,67 +1,68 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- libera (1.0.0)
5
- fileutils (~> 1.0.2)
6
- nokogiri (~> 1.8.2)
7
- om (~> 3.1.1)
8
- pdf-reader (~> 2.1.0)
9
- rmagick (~> 2.16.0)
10
- rtesseract (~> 2.2.0)
11
- ruby-progressbar (~> 1.9.0)
4
+ libera (1.0.6)
5
+ fileutils (~> 1)
6
+ nokogiri (~> 1)
7
+ om (~> 3.1)
8
+ pdf-reader (~> 2.1)
9
+ rmagick (~> 2.16)
10
+ rtesseract (~> 2.2)
11
+ ruby-progressbar (~> 1.9)
12
12
 
13
13
  GEM
14
14
  remote: https://rubygems.org/
15
15
  specs:
16
16
  Ascii85 (1.0.3)
17
- activemodel (5.2.0)
18
- activesupport (= 5.2.0)
19
- activesupport (5.2.0)
17
+ activemodel (6.1.1)
18
+ activesupport (= 6.1.1)
19
+ activesupport (6.1.1)
20
20
  concurrent-ruby (~> 1.0, >= 1.0.2)
21
- i18n (>= 0.7, < 2)
22
- minitest (~> 5.1)
23
- tzinfo (~> 1.1)
21
+ i18n (>= 1.6, < 2)
22
+ minitest (>= 5.1)
23
+ tzinfo (~> 2.0)
24
+ zeitwerk (~> 2.3)
24
25
  afm (0.2.2)
25
- concurrent-ruby (1.0.5)
26
- daemons (1.2.6)
27
- diff-lcs (1.3)
28
- fileutils (1.0.2)
26
+ concurrent-ruby (1.1.7)
27
+ daemons (1.3.1)
28
+ diff-lcs (1.4.4)
29
+ fileutils (1.5.0)
29
30
  hashery (2.1.2)
30
- i18n (1.0.1)
31
+ i18n (1.8.7)
31
32
  concurrent-ruby (~> 1.0)
32
- mini_portile2 (2.3.0)
33
- minitest (5.11.3)
34
- nokogiri (1.8.2)
35
- mini_portile2 (~> 2.3.0)
36
- om (3.1.1)
37
- activemodel
33
+ minitest (5.14.3)
34
+ nokogiri (1.11.1-x86_64-linux)
35
+ racc (~> 1.4)
36
+ om (3.2.0)
37
+ activemodel (>= 5.1, < 7)
38
38
  activesupport
39
39
  nokogiri (>= 1.4.2)
40
40
  solrizer (~> 3.3)
41
- pdf-reader (2.1.0)
41
+ pdf-reader (2.4.1)
42
42
  Ascii85 (~> 1.0.0)
43
43
  afm (~> 0.2.1)
44
44
  hashery (~> 2.0)
45
45
  ruby-rc4
46
46
  ttfunk
47
- rake (10.5.0)
47
+ racc (1.5.2)
48
+ rake (13.0.3)
48
49
  rmagick (2.16.0)
49
- rspec (3.7.0)
50
- rspec-core (~> 3.7.0)
51
- rspec-expectations (~> 3.7.0)
52
- rspec-mocks (~> 3.7.0)
53
- rspec-core (3.7.1)
54
- rspec-support (~> 3.7.0)
55
- rspec-expectations (3.7.0)
50
+ rspec (3.10.0)
51
+ rspec-core (~> 3.10.0)
52
+ rspec-expectations (~> 3.10.0)
53
+ rspec-mocks (~> 3.10.0)
54
+ rspec-core (3.10.1)
55
+ rspec-support (~> 3.10.0)
56
+ rspec-expectations (3.10.1)
56
57
  diff-lcs (>= 1.2.0, < 2.0)
57
- rspec-support (~> 3.7.0)
58
- rspec-mocks (3.7.0)
58
+ rspec-support (~> 3.10.0)
59
+ rspec-mocks (3.10.1)
59
60
  diff-lcs (>= 1.2.0, < 2.0)
60
- rspec-support (~> 3.7.0)
61
- rspec-support (3.7.1)
61
+ rspec-support (~> 3.10.0)
62
+ rspec-support (3.10.1)
62
63
  rtesseract (2.2.0)
63
64
  nokogiri
64
- ruby-progressbar (1.9.0)
65
+ ruby-progressbar (1.11.0)
65
66
  ruby-rc4 (0.1.5)
66
67
  solrizer (3.4.1)
67
68
  activesupport
@@ -69,21 +70,20 @@ GEM
69
70
  nokogiri
70
71
  stomp
71
72
  xml-simple
72
- stomp (1.4.4)
73
- thread_safe (0.3.6)
74
- ttfunk (1.5.1)
75
- tzinfo (1.2.5)
76
- thread_safe (~> 0.1)
77
- xml-simple (1.1.5)
73
+ stomp (1.4.10)
74
+ ttfunk (1.7.0)
75
+ tzinfo (2.0.4)
76
+ concurrent-ruby (~> 1.0)
77
+ xml-simple (1.1.8)
78
+ zeitwerk (2.4.2)
78
79
 
79
80
  PLATFORMS
80
- ruby
81
+ x86_64-linux
81
82
 
82
83
  DEPENDENCIES
83
- bundler (~> 1.16)
84
84
  libera!
85
- rake (~> 10.0)
86
- rspec (~> 3.0)
85
+ rake (~> 13)
86
+ rspec (~> 3)
87
87
 
88
88
  BUNDLED WITH
89
- 1.16.1
89
+ 2.2.6
data/README.md CHANGED
@@ -4,8 +4,37 @@ Libera is a gem built for Charon - https://github.com/NEU-Libraries/charon - a d
4
4
 
5
5
  Its purpose is to take PDF files as input, and split them apart into individual page images for OCR and TEI generation.
6
6
 
7
+ ## Docker
8
+
9
+ Whilst there are installation instructions below for developers, there are often significant environmental challenges to software setup. To that end, Libera can be run in Docker. Install Docker as instructed here - https://docs.docker.com/install/
10
+
11
+ Then pull down the libera container image:
12
+
13
+ docker pull nakatomi/libera
14
+
15
+ To share the PDF with the application, you'll need to bind mount a directory to the container. If you run into permission errors or an empty directory issue, consult the documentation on how bind mounts behave on different host operating systems - https://docs.docker.com/storage/bind-mounts/
16
+
17
+ An example of how to run the container, use a bind mount, and invoke libera is below:
18
+
19
+ docker run -ti --mount type=bind,source=/c/Libera,target=/home/libera/work_dir nakatomi/libera libera -p /home/libera/work_dir/dsg.pdf -w /home/libera/work_dir
20
+
21
+ In the above example, the mounted host directory ```/c/Libera``` becomes ```/home/libera/work_dir``` inside the container. In this use case, the same directory is used both to supply the PDF ```dsg.pdf``` and as the location where the produced artifacts are written.
22
+
23
+ The last part of the command above is the same as what you would run if Libera were installed directly on your host operating system:
24
+
25
+ libera -p /home/libera/work_dir/dsg.pdf -w /home/libera/work_dir
26
+
7
27
  ## Installation
8
28
 
29
+ There are some programs that are required for Libera to work:
30
+
31
+ * Tesseract - https://github.com/tesseract-ocr/tesseract
32
+ * ImageMagick - https://www.imagemagick.org
33
+
34
+ Tesseract 3.03 and ImageMagick 6.7.7-10 were the versions used in the development of this gem.
35
+
36
+ Both should be available through package managers such as APT, Yum, Homebrew etc.
37
+
9
38
  Add this line to your application's Gemfile:
10
39
 
11
40
  ```ruby
@@ -83,7 +83,7 @@ module Libera
83
83
 
84
84
  def mk_working_dir
85
85
  # Check if working dir exists - If not, make it
86
- FileUtils.mkdir("#{Libera.configuration.working_dir}") unless File.exists? "#{Libera.configuration.working_dir}"
86
+ FileUtils.mkdir_p("#{Libera.configuration.working_dir}") unless File.exists? "#{Libera.configuration.working_dir}"
87
87
  end
88
88
  end
89
89
 
@@ -5,7 +5,7 @@ module Libera
5
5
  include OM::XML::Document
6
6
 
7
7
  set_terminology do |t|
8
- t.root(:path => 'tei', :xmlns => 'http://www.tei-c.org/ns/1.0', :namespace_prefix => nil)
8
+ t.root(:path => 'TEI', :xmlns => 'http://www.tei-c.org/ns/1.0', :namespace_prefix => nil)
9
9
  t.text(path: 'text'){
10
10
  t.body(path: 'body'){
11
11
  t.page_break(path: 'pb')
@@ -14,6 +14,14 @@ module Libera
14
14
  }
15
15
  end
16
16
 
17
+ define_template :text do |xml|
18
+ xml.text_
19
+ end
20
+
21
+ define_template :body do |xml|
22
+ xml.body
23
+ end
24
+
17
25
  define_template :page_break do |xml, img_src|
18
26
  xml.pb(:facs => img_src)
19
27
  end
@@ -51,7 +59,33 @@ module Libera
51
59
  return builder.doc
52
60
  end
53
61
 
62
+ def add_text
63
+ begin
64
+ self.template_registry.add_child(self.ng_xml.root, :text)
65
+ rescue NoMethodError
66
+ raise "Unable to add XML node to base template"
67
+ end
68
+ end
69
+
70
+ def add_body
71
+ begin
72
+ self.template_registry.add_child(self.find_by_terms(:text => 0), :body)
73
+ rescue NoMethodError
74
+ raise "Unable to add XML node to base template"
75
+ end
76
+ end
77
+
54
78
  def add_page_break(page_img)
79
+ # any text?
80
+ if self.find_by_terms(:text => 0).blank?
81
+ self.add_text
82
+ end
83
+
84
+ # any body?
85
+ if self.find_by_terms(:text, :body => 0).blank?
86
+ self.add_body
87
+ end
88
+
55
89
  # any anon breaks?
56
90
  ab_count = self.find_by_terms(:text, :body, :anon_block).count
57
91
 
@@ -1,3 +1,3 @@
1
1
  module Libera
2
- VERSION = "1.0.0"
2
+ VERSION = "1.0.6"
3
3
  end
@@ -17,22 +17,22 @@ Gem::Specification.new do |spec|
17
17
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
18
18
  f.match(%r{^(test|spec|features)/})
19
19
  end
20
-
20
+
21
21
  # spec.bindir = "exe"
22
22
  # spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
-
23
+
24
24
  spec.executables = ["libera"]
25
25
  spec.require_paths = ["lib"]
26
26
 
27
- spec.add_development_dependency "bundler", "~> 1.16"
28
- spec.add_development_dependency "rake", "~> 10.0"
29
- spec.add_development_dependency "rspec", "~> 3.0"
30
-
31
- spec.add_dependency "rtesseract", "~> 2.2.0"
32
- spec.add_dependency "rmagick", "~> 2.16.0"
33
- spec.add_dependency "nokogiri", "~> 1.8.2"
34
- spec.add_dependency "pdf-reader", "~> 2.1.0"
35
- spec.add_dependency "om", "~> 3.1.1"
36
- spec.add_dependency "fileutils", "~> 1.0.2"
37
- spec.add_dependency "ruby-progressbar", "~> 1.9.0"
27
+ #spec.add_development_dependency "bundler", "~> 1.16"
28
+ spec.add_development_dependency "rake", "~> 13"
29
+ spec.add_development_dependency "rspec", "~> 3"
30
+
31
+ spec.add_dependency "rtesseract", "~> 2.2"
32
+ spec.add_dependency "rmagick", "~> 2.16"
33
+ spec.add_dependency "nokogiri", "~> 1"
34
+ spec.add_dependency "pdf-reader", "~> 2.1"
35
+ spec.add_dependency "om", "~> 3.1"
36
+ spec.add_dependency "fileutils", "~> 1"
37
+ spec.add_dependency "ruby-progressbar", "~> 1.9"
38
38
  end
metadata CHANGED
@@ -1,155 +1,141 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: libera
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Cliff
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-06-06 00:00:00.000000000 Z
11
+ date: 2021-01-19 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: bundler
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '1.16'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - "~>"
25
- - !ruby/object:Gem::Version
26
- version: '1.16'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: rake
29
15
  requirement: !ruby/object:Gem::Requirement
30
16
  requirements:
31
17
  - - "~>"
32
18
  - !ruby/object:Gem::Version
33
- version: '10.0'
19
+ version: '13'
34
20
  type: :development
35
21
  prerelease: false
36
22
  version_requirements: !ruby/object:Gem::Requirement
37
23
  requirements:
38
24
  - - "~>"
39
25
  - !ruby/object:Gem::Version
40
- version: '10.0'
26
+ version: '13'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: rspec
43
29
  requirement: !ruby/object:Gem::Requirement
44
30
  requirements:
45
31
  - - "~>"
46
32
  - !ruby/object:Gem::Version
47
- version: '3.0'
33
+ version: '3'
48
34
  type: :development
49
35
  prerelease: false
50
36
  version_requirements: !ruby/object:Gem::Requirement
51
37
  requirements:
52
38
  - - "~>"
53
39
  - !ruby/object:Gem::Version
54
- version: '3.0'
40
+ version: '3'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: rtesseract
57
43
  requirement: !ruby/object:Gem::Requirement
58
44
  requirements:
59
45
  - - "~>"
60
46
  - !ruby/object:Gem::Version
61
- version: 2.2.0
47
+ version: '2.2'
62
48
  type: :runtime
63
49
  prerelease: false
64
50
  version_requirements: !ruby/object:Gem::Requirement
65
51
  requirements:
66
52
  - - "~>"
67
53
  - !ruby/object:Gem::Version
68
- version: 2.2.0
54
+ version: '2.2'
69
55
  - !ruby/object:Gem::Dependency
70
56
  name: rmagick
71
57
  requirement: !ruby/object:Gem::Requirement
72
58
  requirements:
73
59
  - - "~>"
74
60
  - !ruby/object:Gem::Version
75
- version: 2.16.0
61
+ version: '2.16'
76
62
  type: :runtime
77
63
  prerelease: false
78
64
  version_requirements: !ruby/object:Gem::Requirement
79
65
  requirements:
80
66
  - - "~>"
81
67
  - !ruby/object:Gem::Version
82
- version: 2.16.0
68
+ version: '2.16'
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: nokogiri
85
71
  requirement: !ruby/object:Gem::Requirement
86
72
  requirements:
87
73
  - - "~>"
88
74
  - !ruby/object:Gem::Version
89
- version: 1.8.2
75
+ version: '1'
90
76
  type: :runtime
91
77
  prerelease: false
92
78
  version_requirements: !ruby/object:Gem::Requirement
93
79
  requirements:
94
80
  - - "~>"
95
81
  - !ruby/object:Gem::Version
96
- version: 1.8.2
82
+ version: '1'
97
83
  - !ruby/object:Gem::Dependency
98
84
  name: pdf-reader
99
85
  requirement: !ruby/object:Gem::Requirement
100
86
  requirements:
101
87
  - - "~>"
102
88
  - !ruby/object:Gem::Version
103
- version: 2.1.0
89
+ version: '2.1'
104
90
  type: :runtime
105
91
  prerelease: false
106
92
  version_requirements: !ruby/object:Gem::Requirement
107
93
  requirements:
108
94
  - - "~>"
109
95
  - !ruby/object:Gem::Version
110
- version: 2.1.0
96
+ version: '2.1'
111
97
  - !ruby/object:Gem::Dependency
112
98
  name: om
113
99
  requirement: !ruby/object:Gem::Requirement
114
100
  requirements:
115
101
  - - "~>"
116
102
  - !ruby/object:Gem::Version
117
- version: 3.1.1
103
+ version: '3.1'
118
104
  type: :runtime
119
105
  prerelease: false
120
106
  version_requirements: !ruby/object:Gem::Requirement
121
107
  requirements:
122
108
  - - "~>"
123
109
  - !ruby/object:Gem::Version
124
- version: 3.1.1
110
+ version: '3.1'
125
111
  - !ruby/object:Gem::Dependency
126
112
  name: fileutils
127
113
  requirement: !ruby/object:Gem::Requirement
128
114
  requirements:
129
115
  - - "~>"
130
116
  - !ruby/object:Gem::Version
131
- version: 1.0.2
117
+ version: '1'
132
118
  type: :runtime
133
119
  prerelease: false
134
120
  version_requirements: !ruby/object:Gem::Requirement
135
121
  requirements:
136
122
  - - "~>"
137
123
  - !ruby/object:Gem::Version
138
- version: 1.0.2
124
+ version: '1'
139
125
  - !ruby/object:Gem::Dependency
140
126
  name: ruby-progressbar
141
127
  requirement: !ruby/object:Gem::Requirement
142
128
  requirements:
143
129
  - - "~>"
144
130
  - !ruby/object:Gem::Version
145
- version: 1.9.0
131
+ version: '1.9'
146
132
  type: :runtime
147
133
  prerelease: false
148
134
  version_requirements: !ruby/object:Gem::Requirement
149
135
  requirements:
150
136
  - - "~>"
151
137
  - !ruby/object:Gem::Version
152
- version: 1.9.0
138
+ version: '1.9'
153
139
  description:
154
140
  email:
155
141
  - dgcliff@northeastern.edu
@@ -160,7 +146,9 @@ extra_rdoc_files: []
160
146
  files:
161
147
  - ".gitignore"
162
148
  - ".rspec"
149
+ - ".ruby-version"
163
150
  - ".travis.yml"
151
+ - Dockerfile
164
152
  - Gemfile
165
153
  - Gemfile.lock
166
154
  - LICENSE.txt
@@ -193,7 +181,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
193
181
  version: '0'
194
182
  requirements: []
195
183
  rubyforge_project:
196
- rubygems_version: 2.6.12
184
+ rubygems_version: 2.7.6.2
197
185
  signing_key:
198
186
  specification_version: 4
199
187
  summary: A utility gem for processing PDFs for OCR and TEI