newspaper_works 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.coveralls.yml +2 -0
- data/.gitignore +4 -0
- data/.travis.yml +2 -2
- data/README.md +14 -13
- data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
- data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
- data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
- data/config/locales/newspaper_article.de.yml +1 -1
- data/config/locales/newspaper_article.en.yml +1 -1
- data/config/locales/newspaper_article.es.yml +1 -1
- data/config/locales/newspaper_article.fr.yml +1 -1
- data/config/locales/newspaper_article.it.yml +1 -1
- data/config/locales/newspaper_article.pt-BR.yml +1 -1
- data/config/locales/newspaper_article.zh.yml +1 -1
- data/config/locales/newspaper_container.de.yml +1 -1
- data/config/locales/newspaper_container.en.yml +1 -1
- data/config/locales/newspaper_container.es.yml +1 -1
- data/config/locales/newspaper_container.fr.yml +1 -1
- data/config/locales/newspaper_container.it.yml +1 -1
- data/config/locales/newspaper_container.pt-BR.yml +1 -1
- data/config/locales/newspaper_container.zh.yml +1 -1
- data/config/locales/newspaper_issue.de.yml +1 -1
- data/config/locales/newspaper_issue.en.yml +1 -1
- data/config/locales/newspaper_issue.es.yml +1 -1
- data/config/locales/newspaper_issue.fr.yml +1 -1
- data/config/locales/newspaper_issue.it.yml +2 -2
- data/config/locales/newspaper_issue.pt-BR.yml +2 -2
- data/config/locales/newspaper_issue.zh.yml +2 -2
- data/config/locales/newspaper_page.de.yml +1 -1
- data/config/locales/newspaper_page.en.yml +1 -1
- data/config/locales/newspaper_page.es.yml +1 -1
- data/config/locales/newspaper_page.fr.yml +1 -1
- data/config/locales/newspaper_page.it.yml +1 -1
- data/config/locales/newspaper_page.pt-BR.yml +1 -1
- data/config/locales/newspaper_page.zh.yml +1 -1
- data/config/locales/newspaper_title.de.yml +1 -1
- data/config/locales/newspaper_title.en.yml +1 -1
- data/config/locales/newspaper_title.es.yml +1 -1
- data/config/locales/newspaper_title.fr.yml +1 -1
- data/config/locales/newspaper_title.it.yml +1 -1
- data/config/locales/newspaper_title.pt-BR.yml +1 -1
- data/config/locales/newspaper_title.zh.yml +1 -1
- data/config/locales/newspaper_works.de.yml +98 -0
- data/config/locales/newspaper_works.en.yml +67 -0
- data/config/locales/newspaper_works.es.yml +96 -0
- data/config/locales/newspaper_works.fr.yml +97 -0
- data/config/locales/newspaper_works.it.yml +90 -0
- data/config/locales/newspaper_works.pt-BR.yml +96 -0
- data/config/locales/newspaper_works.zh.yml +90 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +39 -39
- data/lib/newspaper_works.rb +2 -0
- data/lib/newspaper_works/image_tool.rb +119 -0
- data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
- data/lib/newspaper_works/text_extraction.rb +1 -0
- data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
- data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
- data/lib/newspaper_works/version.rb +1 -1
- data/newspaper_works.gemspec +2 -3
- data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
- data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
- data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
- data/spec/spec_helper.rb +19 -0
- metadata +21 -22
@@ -0,0 +1,55 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<fits_configuration>
|
3
|
+
<!-- Order of the tools determines preference -->
|
4
|
+
<tools>
|
5
|
+
<!-- exclude-exts attribute is a comma delimited list of file extensions that the tool should not try to process -->
|
6
|
+
<!-- include-exts attribute is a comma delimited list of file extensions that are the only ones the tool will process -->
|
7
|
+
<!-- classpath-dirs attribute is a list of directories containing any tool-specific Java JAR files, plus configuration files used solely by those JAR files -->
|
8
|
+
<tool class="edu.harvard.hul.ois.fits.tools.mediainfo.MediaInfo" include-exts="avi,mov,mpg,mpeg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,m2ts,mpeg4" classpath-dirs="lib/mediainfo" />
|
9
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.AudioInfo" include-exts="wav" classpath-dirs="lib/audioinfo" />
|
10
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.ADLTool" include-exts="adl" classpath-dirs="lib/adltool" />
|
11
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.VTTTool" include-exts="vtt" />
|
12
|
+
<tool class="edu.harvard.hul.ois.fits.tools.droid.Droid" exclude-exts="odm,m4a" classpath-dirs="lib/droid" />
|
13
|
+
<tool class="edu.harvard.hul.ois.fits.tools.jhove.Jhove" exclude-exts="dng,mbx,mbox,arw,adl,eml,java,doc,docx,docm,odt,rtf,pages,wpd,wp,epub,csv,avi,mov,mpg,mpeg,mkv,mp4,mpeg4,m2ts,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,pcd,zip" classpath-dirs="lib/jhove" />
|
14
|
+
<tool class="edu.harvard.hul.ois.fits.tools.fileutility.FileUtility" exclude-exts="dng,wps,adl,jar,epub,csv" classpath-dirs="lib/fileutility" />
|
15
|
+
<tool class="edu.harvard.hul.ois.fits.tools.exiftool.Exiftool" exclude-exts="txt,wps,vsd,jar,avi,mov,mpg,mpeg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,m2ts,mpeg4" classpath-dirs="lib/exiftool" />
|
16
|
+
<tool class="edu.harvard.hul.ois.fits.tools.nlnz.MetadataExtractor" include-exts="bmp,gif,jpg,jpeg,wp,wpd,odt,doc,pdf,mp3,bfw,flac,html,xml,arc" classpath-dirs="lib/nzmetool,xml/nlnz"/>
|
17
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.FileInfo" classpath-dirs="lib/fileinfo" />
|
18
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.XmlMetadata" include-exts="xml" classpath-dirs="lib/xmlmetadata" />
|
19
|
+
<tool class="edu.harvard.hul.ois.fits.tools.ffident.FFIdent" exclude-exts="dng,wps,vsd,jar,ppt,rtf" classpath-dirs="lib/ffident" />
|
20
|
+
|
21
|
+
</tools>
|
22
|
+
|
23
|
+
<output>
|
24
|
+
<dataConsolidator class="edu.harvard.hul.ois.fits.consolidation.OISConsolidator"/>
|
25
|
+
<display-tool-output>false</display-tool-output>
|
26
|
+
<report-conflicts>true</report-conflicts>
|
27
|
+
<validate-tool-output>false</validate-tool-output>
|
28
|
+
<internal-output-schema>xml/fits_output.xsd</internal-output-schema>
|
29
|
+
<external-output-schema>http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd</external-output-schema>
|
30
|
+
<fits-xml-namespace>http://hul.harvard.edu/ois/xml/ns/fits/fits_output</fits-xml-namespace>
|
31
|
+
<enable-statistics>true</enable-statistics>
|
32
|
+
<enable-checksum>true</enable-checksum>
|
33
|
+
<!-- The below controls the exclusion of the checksum for certain files, even if enable-checksum is true -->
|
34
|
+
<!-- Video Exclusions -->
|
35
|
+
<!-- <checksum-exclusions exclude-exts="avi,mov,mpg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv"/> -->
|
36
|
+
<!-- Audio Exclusions -->
|
37
|
+
<!-- <checksum-exclusions exclude-exts="wav,aif,mp3,mp4,m4a,ra,rm"/> -->
|
38
|
+
</output>
|
39
|
+
|
40
|
+
<process>
|
41
|
+
<max-threads>20</max-threads>
|
42
|
+
</process>
|
43
|
+
|
44
|
+
<!-- file name of the droid signature file to use in tools/droid/-->
|
45
|
+
<droid_sigfile>DROID_SignatureFile_V90.xml</droid_sigfile>
|
46
|
+
<!-- Limits number of bytes DROID reads in (in KB) for processing large files for the listed file extensions. -->
|
47
|
+
<!-- Note: This should only be used with files that can provide sufficient metadata at beginning of the file -->
|
48
|
+
<!-- <droid_read_limit include-exts="mov,mxf" read-limit-kb="64" /> -->
|
49
|
+
|
50
|
+
<!-- the fits home is used by the MediaInfo tool to load the jna api libs -->
|
51
|
+
<!-- in most cases you won't need to change -->
|
52
|
+
<!-- example for BB will be /fits -->
|
53
|
+
<fits_home>.</fits_home>
|
54
|
+
|
55
|
+
</fits_configuration>
|
@@ -2,61 +2,60 @@
|
|
2
2
|
<!DOCTYPE policymap [
|
3
3
|
<!ELEMENT policymap (policy)+>
|
4
4
|
<!ELEMENT policy (#PCDATA)>
|
5
|
-
<!ATTLIST policy domain (delegate|coder|filter|path|resource) #IMPLIED>
|
5
|
+
<!ATTLIST policy domain (delegate|coder|filter|path|resource|cache) #IMPLIED>
|
6
6
|
<!ATTLIST policy name CDATA #IMPLIED>
|
7
7
|
<!ATTLIST policy rights CDATA #IMPLIED>
|
8
8
|
<!ATTLIST policy pattern CDATA #IMPLIED>
|
9
9
|
<!ATTLIST policy value CDATA #IMPLIED>
|
10
10
|
]>
|
11
|
-
<
|
12
|
-
<policy domain="coder" rights="read|write" pattern="
|
13
|
-
|
14
|
-
|
11
|
+
<policymap>
|
12
|
+
<policy domain="coder" rights="read|write" pattern="PDF" />
|
13
|
+
<policy domain="coder" rights="read|write" pattern="LABEL" />
|
14
|
+
<!--
|
15
|
+
Configure ImageMagick policies.
|
15
16
|
|
16
|
-
|
17
|
+
Domains include system, delegate, coder, filter, path, or resource.
|
17
18
|
|
18
|
-
|
19
|
-
|
19
|
+
Rights include none, read, write, and execute. Use | to combine them,
|
20
|
+
for example: "read | write" to permit read from, or write to, a path.
|
20
21
|
|
21
|
-
|
22
|
+
Use a glob expression as a pattern.
|
22
23
|
|
23
|
-
|
24
|
+
Suppose we do not want users to process MPEG video images:
|
24
25
|
|
25
|
-
|
26
|
+
<policy domain="delegate" rights="none" pattern="mpeg:decode" />
|
26
27
|
|
27
|
-
|
28
|
+
Here we do not want users reading images from HTTP:
|
28
29
|
|
29
|
-
|
30
|
+
<policy domain="coder" rights="none" pattern="HTTP" />
|
30
31
|
|
31
|
-
|
32
|
+
Let's prevent users from executing any image filters:
|
32
33
|
|
33
|
-
|
34
|
+
<policy domain="filter" rights="none" pattern="*" />
|
34
35
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
The /repository file system is restricted to read only. We use a glob
|
37
|
+
expression to match all paths that start with /repository:
|
38
|
+
|
39
|
+
<policy domain="path" rights="read" pattern="/repository/*" />
|
39
40
|
|
40
|
-
|
41
|
+
Any large image is cached to disk rather than memory:
|
41
42
|
|
42
|
-
|
43
|
+
<policy domain="resource" name="area" value="1GB"/>
|
44
|
+
|
45
|
+
Define arguments for the memory, map, area, and disk resources with
|
46
|
+
SI prefixes (e.g. 100MB). In addition, resource policies are maximums for
|
47
|
+
each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
|
48
|
+
exceeds policy maximum so memory limit is 1GB).
|
49
|
+
-->
|
50
|
+
|
51
|
+
<!-- NewspaperWorks: allow more than default Ubuntu policy, re: RAM, Disk -->
|
52
|
+
<policy domain="resource" name="memory" value="512MiB"/>
|
53
|
+
<policy domain="resource" name="map" value="1GiB"/>
|
54
|
+
<policy domain="resource" name="width" value="20KP"/>
|
55
|
+
<policy domain="resource" name="height" value="20KP"/>
|
56
|
+
<policy domain="resource" name="area" value="128MB"/>
|
57
|
+
<policy domain="resource" name="disk" value="2GiB"/>
|
43
58
|
|
44
|
-
Define arguments for the memory, map, area, and disk resources with
|
45
|
-
SI prefixes (.e.g 100MB). In addition, resource policies are maximums for
|
46
|
-
each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
|
47
|
-
exceeds policy maximum so memory limit is 1GB).
|
48
|
-
-->
|
49
|
-
<policymap>
|
50
|
-
<!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
|
51
|
-
<!-- <policy domain="resource" name="memory" value="2GiB"/> -->
|
52
|
-
<!-- <policy domain="resource" name="map" value="4GiB"/> -->
|
53
|
-
<!-- <policy domain="resource" name="area" value="1GB"/> -->
|
54
|
-
<!-- <policy domain="resource" name="disk" value="16EB"/> -->
|
55
|
-
<!-- <policy domain="resource" name="file" value="768"/> -->
|
56
|
-
<!-- <policy domain="resource" name="thread" value="4"/> -->
|
57
|
-
<!-- <policy domain="resource" name="throttle" value="0"/> -->
|
58
|
-
<!-- <policy domain="resource" name="time" value="3600"/> -->
|
59
|
-
<!-- <policy domain="system" name="precision" value="6"/> -->
|
60
59
|
<policy domain="cache" name="shared-secret" value="passphrase"/>
|
61
60
|
<policy domain="coder" rights="none" pattern="EPHEMERAL" />
|
62
61
|
<policy domain="coder" rights="none" pattern="URL" />
|
@@ -68,9 +67,10 @@
|
|
68
67
|
<policy domain="coder" rights="none" pattern="WIN" />
|
69
68
|
<policy domain="coder" rights="none" pattern="PLT" />
|
70
69
|
<policy domain="path" rights="none" pattern="@*" />
|
71
|
-
<!-- disable ghostscript format types -->
|
70
|
+
<!-- disable ghostscript format types (except PDF) -->
|
72
71
|
<policy domain="coder" rights="none" pattern="PS" />
|
73
72
|
<policy domain="coder" rights="none" pattern="EPS" />
|
74
|
-
<!-- <policy domain="coder" rights="none" pattern="PDF" /> -->
|
75
73
|
<policy domain="coder" rights="none" pattern="XPS" />
|
74
|
+
<!-- NewspaperWorks: we need to allow PDF here -->
|
75
|
+
<!-- <policy domain="coder" rights="none" pattern="PDF" /> -->
|
76
76
|
</policymap>
|
data/lib/newspaper_works.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require "newspaper_works/engine"
|
2
2
|
require "newspaper_works/errors"
|
3
|
+
require "newspaper_works/jp2_image_metadata"
|
4
|
+
require "newspaper_works/image_tool"
|
3
5
|
require "newspaper_works/ingest"
|
4
6
|
require "newspaper_works/issue_pdf_composer"
|
5
7
|
require "newspaper_works/text_extraction"
|
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
module NewspaperWorks
  # Reads technical metadata from, and converts, a single image file by
  # invoking the ImageMagick (`identify`, `convert`) and OpenJPEG
  # (`opj_decompress`) command-line tools.
  #
  # External commands are executed with an argument vector (no shell), so
  # paths containing spaces or shell metacharacters are passed through
  # verbatim.
  class ImageTool
    # Number of leading bytes read from the file to sniff its type.
    MAGIC_LENGTH = 23

    # ImageMagick options used to produce bilevel, Group4-compressed TIFF.
    MONO_OPTS = %w[-depth 1 -monochrome -compress Group4 -type bilevel].freeze

    attr_accessor :path, :ftype

    # @param path [String] path to the source image file (must be readable;
    #   its leading bytes are read immediately)
    def initialize(path)
      @path = path
      @ftype = magic
      @metadata = nil
    end

    # @return [Hash] hash with following symbol keys, and respectively
    #   typed String and/or Integer values.
    #   :width, :height — both in Integer px units
    #   :color — (String enumerated from 'gray', 'monochrome', 'color')
    #   :num_components - Integer, number of channels
    #   :bits_per_component — Integer, bits per channel (e.g. 8 vs. 1)
    #   :content_type — RFC 2045 MIME type
    def metadata
      # memoized: computed on first call only
      @metadata ||= jp2? ? jp2_metadata : identify_metadata
    end

    # Convert source image to image at destination path, inferring file type
    #   from destination file extension.  In case of JP2 files, create
    #   intermediate file using OpenJPEG 2000 that ImageMagick can use.
    #   Only outputs monochrome output if monochrome is true, destination
    #   format is TIFF.
    # @param destination [String] Path to output / destination file
    # @param monochrome [Boolean] true if monochrome output, otherwise false
    # @return [String] standard output of the `convert` command
    # @raise [RuntimeError] if destination ends with 'jp2'
    def convert(destination, monochrome = false)
      raise 'JP2 output not yet supported' if destination.end_with?('jp2')
      return convert_image(@path, destination, monochrome) unless jp2?
      # Block form removes the scratch directory (and the intermediate TIFF)
      # once ImageMagick has finished reading it.
      Dir.mktmpdir do |workdir|
        convert_image(jp2_to_tiff(@path, workdir), destination, monochrome)
      end
    end

    private

      # Run an external command without a shell.
      # @param argv [Array<#to_s>] command name followed by its arguments
      # @return [String] captured standard output of the command
      def run_command(*argv)
        output, _status = Open3.capture2(*argv.map(&:to_s))
        output
      end

      # @param destination [String] output path
      # @return [Boolean] true if the last four characters of destination
      #   contain 'tif' (matches both '.tif' and '.tiff')
      def tiff_destination?(destination)
        destination.to_s[-4, 4].to_s.include?('tif')
      end

      # Invoke ImageMagick `convert`; monochrome options are only applied
      # when requested AND the destination is TIFF.
      def convert_image(source, destination, monochrome)
        argv = ['convert', source]
        argv.concat(MONO_OPTS) if monochrome && tiff_destination?(destination)
        argv << destination
        run_command(*argv)
      end

      # Decompress JP2 source into an intermediate TIFF via `opj_decompress`.
      # @param source [String] path to JP2 file
      # @param workdir [String] directory to hold the intermediate file; when
      #   omitted a new temporary directory is created, which the caller is
      #   then responsible for removing
      # @return [String] path to intermediate TIFF
      def jp2_to_tiff(source, workdir = Dir.mktmpdir)
        intermediate_path = File.join(workdir, 'intermediate.tif')
        run_command('opj_decompress', '-i', source, '-o', intermediate_path)
        intermediate_path
      end

      # Metadata for JP2 files, read directly from the file rather than via
      # ImageMagick.
      def jp2_metadata
        result = NewspaperWorks::JP2ImageMetadata.new(path).technical_metadata
        result[:content_type] = 'image/jp2'
        result
      end

      # Given lines of "key: value" output, find the first line whose key
      # starts with the given prefix (compared case-insensitively) and return
      # its value as a String stripped of leading and trailing whitespace.
      # @param lines [Array<String>] output lines from `identify -verbose`
      # @param key [String] key prefix to look for
      # @return [String, nil] value text, or nil when no line matches
      def im_line_select(lines, key)
        # the line is downcased for comparison, so the key must be as well
        prefix = key.downcase
        line = lines.find { |l| l.downcase.strip.start_with?(prefix) }
        return if line.nil?
        line.strip.split(':')[-1].strip
      end

      # @return [Array(Integer, Integer)] width, height in Integer px units
      def im_identify_geometry(lines)
        # geometry value looks like "640x480+0+0"; discard the offsets
        img_geo = im_line_select(lines, 'geometry').split('+')[0]
        img_geo.split('x').map(&:to_i)
      end

      # @return [Array<String>] lines of output from imagemagick `identify`
      def im_identify
        run_command('identify', '-verbose', path).lines
      end

      # @return [String, nil] MIME type reported by `identify`
      def im_mime(lines)
        return 'application/pdf' if pdf? # workaround older imagemagick bug
        im_line_select(lines, 'mime type')
      end

      # Populate :num_components, :color, :bits_per_component in result from
      # `identify -verbose` output lines.
      def populate_im_color!(lines, result)
        bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
        colorspace = im_line_select(lines, 'colorspace')
        color = colorspace == 'Gray' ? 'gray' : 'color'
        # Match "alpha:" exactly (with colon) so unrelated keys that merely
        # begin with the word "alpha" are not mistaken for an alpha channel.
        has_alpha = !im_line_select(lines, 'alpha:').nil?
        result[:num_components] = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
        result[:color] = bpc == 1 ? 'monochrome' : color
        result[:bits_per_component] = bpc
      end

      # Return metadata by means of imagemagick identify
      def identify_metadata
        result = {}
        lines = im_identify
        result[:width], result[:height] = im_identify_geometry(lines)
        result[:content_type] = im_mime(lines)
        populate_im_color!(lines, result)
        result
      end

      # @return [String] up to MAGIC_LENGTH leading bytes of the file; empty
      #   String for an empty file
      def magic
        File.binread(@path, MAGIC_LENGTH).to_s
      end

      # JP2 files carry the 'ftyp' box type and 'jp2' brand at bytes 16-22.
      def jp2?
        @ftype.to_s.end_with?('ftypjp2')
      end

      def pdf?
        magic.start_with?('%PDF-')
      end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module NewspaperWorks
  # Extracts basic technical metadata (dimensions, component count, bit
  # depth) from a JPEG 2000 (JP2) file by reading its header bytes directly,
  # without any external tools.
  class JP2ImageMetadata
    TOKEN_MARKER_START = "\xFF".force_encoding("BINARY").freeze
    TOKEN_MARKER_SIZ = "\x51".force_encoding("BINARY").freeze
    TOKEN_IHDR = 'ihdr'.freeze

    attr_accessor :path

    # @param path [String] path to jp2, for reading
    def initialize(path)
      @path = path
    end

    # Scan the stream for the SIZ marker (0xFF 0x51) and read the reference
    # grid size that follows it.
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] X size, Y size, in Integer-typed px
    # @raise [IOError] if io is not in binary mode, or if no complete SIZ
    #   marker segment is found before end of file
    def extract_jp2_dim(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      # Informed by ISO/IEC 15444-1:2000, pp. 26-27
      # via:
      # http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
      #
      # first 23 bytes are file-magic, we can skip
      io.seek(23, IO::SEEK_SET)
      previous = nil
      # read one byte at a time; io.read returns nil at EOF, ending the scan
      while (byte = io.read(1))
        if previous == TOKEN_MARKER_START && byte == TOKEN_MARKER_SIZ
          # on 0xFF 0x51, read next 12 bytes of the marker segment
          segment = io.read(12)
          raise IOError, 'Truncated SIZ marker segment' if segment.nil? || segment.bytesize < 12
          # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
          return segment.byteslice(4, 8).unpack('NN')
        end
        previous = byte
      end
      raise IOError, 'SIZ marker not found'
    end

    # Read component count and bit depth from the `ihdr` box.
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] number components, bits-per-component
    # @raise [IOError] if io is not in binary mode, or if a complete `ihdr`
    #   box is not present in the first 64 bytes
    def extract_jp2_components(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      io.seek(0, IO::SEEK_SET)
      # IHDR should be in first 64 bytes
      header = io.read(64).to_s
      offset = header.rindex(TOKEN_IHDR)
      raise IOError, 'ihdr box not found' if offset.nil?
      # box payload: height (4 bytes), width (4), components (2), depth (1)
      ihdr_data = header.byteslice(offset + TOKEN_IHDR.bytesize, 11)
      raise IOError, 'Truncated ihdr box' if ihdr_data.nil? || ihdr_data.bytesize < 11
      num_components = ihdr_data.byteslice(8, 2).unpack('n').first
      # Low 7 bits are stored as "bit depth of the components in the
      # codestream, minus 1", so add 1; the high bit only flags signed
      # sample values and must be masked off.
      # NOTE(review): a raw value of 255 signals per-component depths stored
      # elsewhere in the file; that case is not handled here.
      bits_per_component = (ihdr_data.getbyte(10) & 0x7F) + 1
      [num_components, bits_per_component]
    end

    # Verify the stream begins with JP2 file magic; consumes 23 bytes from
    # the current position.
    #
    # @param io [IO] IO stream positioned at start of file
    # @raise [IOError] if the leading bytes do not identify a JP2 file
    def validate_jp2(io)
      # verify file is jp2; to_s guards against nil from an empty file
      magic = io.read(23).to_s
      raise IOError, 'Not JP2 file' unless magic.end_with?('ftypjp2')
    end

    # @return [Hash] hash with symbol keys :color (String, one of 'gray',
    #   'monochrome', 'color'), :num_components, :bits_per_component,
    #   :width, :height (all Integer)
    # @raise [IOError] if file is not JP2, or its header cannot be parsed
    def technical_metadata
      # block form guarantees the file is closed even when parsing raises
      File.open(path, 'rb') do |io|
        validate_jp2(io)
        x_siz, y_siz = extract_jp2_dim(io)
        nc, bpc = extract_jp2_components(io)
        color = nc >= 3 ? 'color' : 'gray'
        {
          color: bpc == 1 ? 'monochrome' : color,
          num_components: nc,
          bits_per_component: bpc,
          width: x_siz,
          height: y_siz
        }
      end
    end
  end
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'newspaper_works/text_extraction/alto_reader'
|
2
|
+
require 'newspaper_works/text_extraction/hocr_reader'
|
2
3
|
require 'newspaper_works/text_extraction/page_ocr'
|
3
4
|
require 'newspaper_works/text_extraction/render_alto'
|
4
5
|
require 'newspaper_works/text_extraction/word_coords_builder'
|
@@ -0,0 +1,173 @@
|
|
1
|
+
require 'active_support/core_ext/module/delegation'
|
2
|
+
require 'json'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module NewspaperWorks
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from hOCR source
    #   - Coordinates in px units, unlike ALTO, which may have scaling concerns
    class HOCRReader
      attr_accessor :source, :doc_stream
      delegate :text, :width, :height, :words, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from hOCR
      class HOCRDocStream < Nokogiri::XML::SAX::Document
        attr_accessor :text, :words, :width, :height

        def initialize
          super()
          # plain text buffer:
          @text = ''
          # list of word hash, containing word+coord:
          @words = []
          # page width and height to be found in hOCR for `div.ocr_page`
          @width = nil
          @height = nil
          # to hold current word data state across #start_element, #characters,
          #   and #end_element methods (to associate word with coordinates).
          @current = nil
          # to preserve element classname from start to use by #end_element
          @element_class_name = nil
        end

        # Extract the integer values of the `bbox` property from an hOCR
        #   `title` attribute (semicolon-delimited properties), wherever in
        #   the property list it appears.
        #
        # @param title [String, nil] value of element `title` attribute
        # @return [Array<Integer>] bbox values (x1, y1, x2, y2), or empty
        #   Array when title has no bbox property
        def bbox_values(title)
          properties = title.to_s.split(';').map(&:strip)
          bbox = properties.find { |property| property.start_with?('bbox ') }
          return [] if bbox.nil?
          bbox.split(' ').drop(1).map(&:to_i)
        end

        # Return coordinates from `span.ocrx_word` element attribute hash
        #
        # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
        # @return [Array] Array of position x, y, width, height in px; empty
        #   Array when the element has no usable bbox (such a word is then
        #   rejected by #word_complete?).
        def s_coords(attrs)
          x1, y1, x2, y2 = bbox_values(attrs['title'])
          return [] if y2.nil?
          height = y2 - y1
          width = x2 - x1
          hpos = x1
          vpos = y1
          [hpos, vpos, width, height]
        end

        # Consider element for processing?
        #   - `div.ocr_page` — to get page width/height
        #   - `span.ocr_line` — to help make plain text readable
        #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
        # @param name [String] Element name
        # @param class_name [String] HTML class name
        # @return [Boolean] true if element should be processed; otherwise false
        def consider?(name, class_name)
          selector = "#{name}.#{class_name}"
          ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].include?(selector)
        end

        # Begin state for a new word, from `span.ocrx_word` attributes.
        def start_word(attrs)
          @current = {}
          # will be replaced during #characters method call:
          @current[:word] = nil
          @current[:coordinates] = s_coords(attrs)
        end

        # Record page dimensions from `div.ocr_page` attributes; width and
        #   height remain nil when the title carries no bbox property.
        def start_page(attrs)
          bbox = bbox_values(attrs['title'])
          # width and height:
          @width = bbox[2]
          @height = bbox[3]
        end

        # @return [Boolean] truthy when current word has non-empty text and
        #   a full set of four coordinate values
        def word_complete?
          return false if @current.nil?
          coords = @current[:coordinates]
          @current[:word] && !@current[:word].empty? && coords.size == 4
        end

        def end_word
          # add trailing space to plaintext buffer for between words:
          @text += ' '
          @words.push(@current) if word_complete?
        end

        def end_line
          # strip trailing whitespace
          @text.strip!
          # then insert a line break
          @text += "\n"
        end

        # Callback for element start, ignores elements except for:
        #   - `div.ocr_page` — to get page width/height
        #   - `span.ocr_line` — to help make plain text readable
        #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          attributes = attrs.to_h
          @element_class_name = attributes['class']
          return unless consider?(name, @element_class_name)
          start_word(attributes) if @element_class_name == 'ocrx_word'
          start_page(attributes) if @element_class_name == 'ocr_page'
        end

        # Callback for character data; appended to both the current word
        #   (once a word has been started) and the plain text buffer.
        def characters(value)
          return if @current.nil?
          return if @current[:coordinates].nil?
          @current[:word] ||= ''
          @current[:word] += value
          @text += value
        end

        # Callback for element end; at this time, flush word coordinate state
        #   for current word, and append line endings to plain text:
        #
        # NOTE(review): @element_class_name holds the class of the most
        #   recently *started* element, which is not necessarily the element
        #   being closed here (e.g. when a line closes after its last word);
        #   confirm whether a stack of open element classes is needed.
        #
        # @param name [String] element name.
        def end_element(_name)
          end_line if @element_class_name == 'ocr_line'
          end_word if @element_class_name == 'ocrx_word'
        end

        # Callback for completion of parsing hOCR, used to normalize generated
        #   text content (strip unneeded whitespace incidental to output).
        def end_document
          # postprocess @text to remove trailing spaces on lines
          @text = @text.split("\n").map(&:strip).join("\n")
          # remove excess line break
          @text.gsub!(/\n+/, "\n")
          # String#delete returns a new String, so the result must be kept
          @text = @text.delete("\r")
          # remove trailing whitespace at end of buffer
          @text.strip!
        end
      end

      # Construct with either path or HTML [String]
      #
      # @param html [String], and process document
      def initialize(html)
        @source = isxml?(html) ? html : File.read(html)
        @doc_stream = HOCRDocStream.new
        parser = Nokogiri::HTML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml/html
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if value appears to be XML/HTML, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        words = @doc_stream.words
        builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(
          words,
          @doc_stream.width,
          @doc_stream.height
        )
        builder.to_json
      end
    end
  end
end
|