newspaper_works 0.1.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.coveralls.yml +2 -0
- data/.gitignore +4 -0
- data/.travis.yml +2 -2
- data/README.md +14 -13
- data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
- data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
- data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
- data/config/locales/newspaper_article.de.yml +1 -1
- data/config/locales/newspaper_article.en.yml +1 -1
- data/config/locales/newspaper_article.es.yml +1 -1
- data/config/locales/newspaper_article.fr.yml +1 -1
- data/config/locales/newspaper_article.it.yml +1 -1
- data/config/locales/newspaper_article.pt-BR.yml +1 -1
- data/config/locales/newspaper_article.zh.yml +1 -1
- data/config/locales/newspaper_container.de.yml +1 -1
- data/config/locales/newspaper_container.en.yml +1 -1
- data/config/locales/newspaper_container.es.yml +1 -1
- data/config/locales/newspaper_container.fr.yml +1 -1
- data/config/locales/newspaper_container.it.yml +1 -1
- data/config/locales/newspaper_container.pt-BR.yml +1 -1
- data/config/locales/newspaper_container.zh.yml +1 -1
- data/config/locales/newspaper_issue.de.yml +1 -1
- data/config/locales/newspaper_issue.en.yml +1 -1
- data/config/locales/newspaper_issue.es.yml +1 -1
- data/config/locales/newspaper_issue.fr.yml +1 -1
- data/config/locales/newspaper_issue.it.yml +2 -2
- data/config/locales/newspaper_issue.pt-BR.yml +2 -2
- data/config/locales/newspaper_issue.zh.yml +2 -2
- data/config/locales/newspaper_page.de.yml +1 -1
- data/config/locales/newspaper_page.en.yml +1 -1
- data/config/locales/newspaper_page.es.yml +1 -1
- data/config/locales/newspaper_page.fr.yml +1 -1
- data/config/locales/newspaper_page.it.yml +1 -1
- data/config/locales/newspaper_page.pt-BR.yml +1 -1
- data/config/locales/newspaper_page.zh.yml +1 -1
- data/config/locales/newspaper_title.de.yml +1 -1
- data/config/locales/newspaper_title.en.yml +1 -1
- data/config/locales/newspaper_title.es.yml +1 -1
- data/config/locales/newspaper_title.fr.yml +1 -1
- data/config/locales/newspaper_title.it.yml +1 -1
- data/config/locales/newspaper_title.pt-BR.yml +1 -1
- data/config/locales/newspaper_title.zh.yml +1 -1
- data/config/locales/newspaper_works.de.yml +98 -0
- data/config/locales/newspaper_works.en.yml +67 -0
- data/config/locales/newspaper_works.es.yml +96 -0
- data/config/locales/newspaper_works.fr.yml +97 -0
- data/config/locales/newspaper_works.it.yml +90 -0
- data/config/locales/newspaper_works.pt-BR.yml +96 -0
- data/config/locales/newspaper_works.zh.yml +90 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +39 -39
- data/lib/newspaper_works.rb +2 -0
- data/lib/newspaper_works/image_tool.rb +119 -0
- data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
- data/lib/newspaper_works/text_extraction.rb +1 -0
- data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
- data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
- data/lib/newspaper_works/version.rb +1 -1
- data/newspaper_works.gemspec +2 -3
- data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
- data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
- data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
- data/spec/spec_helper.rb +19 -0
- metadata +21 -22
@@ -0,0 +1,55 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<fits_configuration>
|
3
|
+
<!-- Order of the tools determines preference -->
|
4
|
+
<tools>
|
5
|
+
<!-- exclude-exts attribute is a comma delimited list of file extensions that the tool should not try to process -->
|
6
|
+
<!-- include-exts attribute is a comma delimited list of file extensions that are the only ones the tool will process -->
|
7
|
+
<!-- classpath-dirs attribute is a list of directories containing any tool-specific Java JAR files and the configuration files used solely by those JAR files -->
|
8
|
+
<tool class="edu.harvard.hul.ois.fits.tools.mediainfo.MediaInfo" include-exts="avi,mov,mpg,mpeg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,m2ts,mpeg4" classpath-dirs="lib/mediainfo" />
|
9
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.AudioInfo" include-exts="wav" classpath-dirs="lib/audioinfo" />
|
10
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.ADLTool" include-exts="adl" classpath-dirs="lib/adltool" />
|
11
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.VTTTool" include-exts="vtt" />
|
12
|
+
<tool class="edu.harvard.hul.ois.fits.tools.droid.Droid" exclude-exts="odm,m4a" classpath-dirs="lib/droid" />
|
13
|
+
<tool class="edu.harvard.hul.ois.fits.tools.jhove.Jhove" exclude-exts="dng,mbx,mbox,arw,adl,eml,java,doc,docx,docm,odt,rtf,pages,wpd,wp,epub,csv,avi,mov,mpg,mpeg,mkv,mp4,mpeg4,m2ts,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,pcd,zip" classpath-dirs="lib/jhove" />
|
14
|
+
<tool class="edu.harvard.hul.ois.fits.tools.fileutility.FileUtility" exclude-exts="dng,wps,adl,jar,epub,csv" classpath-dirs="lib/fileutility" />
|
15
|
+
<tool class="edu.harvard.hul.ois.fits.tools.exiftool.Exiftool" exclude-exts="txt,wps,vsd,jar,avi,mov,mpg,mpeg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,m2ts,mpeg4" classpath-dirs="lib/exiftool" />
|
16
|
+
<tool class="edu.harvard.hul.ois.fits.tools.nlnz.MetadataExtractor" include-exts="bmp,gif,jpg,jpeg,wp,wpd,odt,doc,pdf,mp3,bfw,flac,html,xml,arc" classpath-dirs="lib/nzmetool,xml/nlnz"/>
|
17
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.FileInfo" classpath-dirs="lib/fileinfo" />
|
18
|
+
<tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.XmlMetadata" include-exts="xml" classpath-dirs="lib/xmlmetadata" />
|
19
|
+
<tool class="edu.harvard.hul.ois.fits.tools.ffident.FFIdent" exclude-exts="dng,wps,vsd,jar,ppt,rtf" classpath-dirs="lib/ffident" />
|
20
|
+
|
21
|
+
</tools>
|
22
|
+
|
23
|
+
<output>
|
24
|
+
<dataConsolidator class="edu.harvard.hul.ois.fits.consolidation.OISConsolidator"/>
|
25
|
+
<display-tool-output>false</display-tool-output>
|
26
|
+
<report-conflicts>true</report-conflicts>
|
27
|
+
<validate-tool-output>false</validate-tool-output>
|
28
|
+
<internal-output-schema>xml/fits_output.xsd</internal-output-schema>
|
29
|
+
<external-output-schema>http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd</external-output-schema>
|
30
|
+
<fits-xml-namespace>http://hul.harvard.edu/ois/xml/ns/fits/fits_output</fits-xml-namespace>
|
31
|
+
<enable-statistics>true</enable-statistics>
|
32
|
+
<enable-checksum>true</enable-checksum>
|
33
|
+
<!-- The below controls the exclusion of the checksum for certain files, even if enable-checksum is true -->
|
34
|
+
<!-- Video Exclusions -->
|
35
|
+
<!-- <checksum-exclusions exclude-exts="avi,mov,mpg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv"/> -->
|
36
|
+
<!-- Audio Exclusions -->
|
37
|
+
<!-- <checksum-exclusions exclude-exts="wav,aif,mp3,mp4,m4a,ra,rm"/> -->
|
38
|
+
</output>
|
39
|
+
|
40
|
+
<process>
|
41
|
+
<max-threads>20</max-threads>
|
42
|
+
</process>
|
43
|
+
|
44
|
+
<!-- file name of the droid signature file to use in tools/droid/-->
|
45
|
+
<droid_sigfile>DROID_SignatureFile_V90.xml</droid_sigfile>
|
46
|
+
<!-- Limits number of bytes DROID reads in (in KB) for processing large files for the listed file extensions. -->
|
47
|
+
<!-- Note: This should only be used with files that can provide sufficient metadata at beginning of the file -->
|
48
|
+
<!-- <droid_read_limit include-exts="mov,mxf" read-limit-kb="64" /> -->
|
49
|
+
|
50
|
+
<!-- the fits home is used by the MediaInfo tool to load the jna api libs -->
|
51
|
+
<!-- in most cases you won't need to change -->
|
52
|
+
<!-- example for BB will be /fits -->
|
53
|
+
<fits_home>.</fits_home>
|
54
|
+
|
55
|
+
</fits_configuration>
|
@@ -2,61 +2,60 @@
|
|
2
2
|
<!DOCTYPE policymap [
|
3
3
|
<!ELEMENT policymap (policy)+>
|
4
4
|
<!ELEMENT policy (#PCDATA)>
|
5
|
-
<!ATTLIST policy domain (delegate|coder|filter|path|resource) #IMPLIED>
|
5
|
+
<!ATTLIST policy domain (delegate|coder|filter|path|resource|cache) #IMPLIED>
|
6
6
|
<!ATTLIST policy name CDATA #IMPLIED>
|
7
7
|
<!ATTLIST policy rights CDATA #IMPLIED>
|
8
8
|
<!ATTLIST policy pattern CDATA #IMPLIED>
|
9
9
|
<!ATTLIST policy value CDATA #IMPLIED>
|
10
10
|
]>
|
11
|
-
<
|
12
|
-
<policy domain="coder" rights="read|write" pattern="
|
13
|
-
|
14
|
-
|
11
|
+
<policymap>
|
12
|
+
<policy domain="coder" rights="read|write" pattern="PDF" />
|
13
|
+
<policy domain="coder" rights="read|write" pattern="LABEL" />
|
14
|
+
<!--
|
15
|
+
Configure ImageMagick policies.
|
15
16
|
|
16
|
-
|
17
|
+
Domains include system, delegate, coder, filter, path, or resource.
|
17
18
|
|
18
|
-
|
19
|
-
|
19
|
+
Rights include none, read, write, and execute. Use | to combine them,
|
20
|
+
for example: "read | write" to permit read from, or write to, a path.
|
20
21
|
|
21
|
-
|
22
|
+
Use a glob expression as a pattern.
|
22
23
|
|
23
|
-
|
24
|
+
Suppose we do not want users to process MPEG video images:
|
24
25
|
|
25
|
-
|
26
|
+
<policy domain="delegate" rights="none" pattern="mpeg:decode" />
|
26
27
|
|
27
|
-
|
28
|
+
Here we do not want users reading images from HTTP:
|
28
29
|
|
29
|
-
|
30
|
+
<policy domain="coder" rights="none" pattern="HTTP" />
|
30
31
|
|
31
|
-
|
32
|
+
Let's prevent users from executing any image filters:
|
32
33
|
|
33
|
-
|
34
|
+
<policy domain="filter" rights="none" pattern="*" />
|
34
35
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
The /repository file system is restricted to read only. We use a glob
|
37
|
+
expression to match all paths that start with /repository:
|
38
|
+
|
39
|
+
<policy domain="path" rights="read" pattern="/repository/*" />
|
39
40
|
|
40
|
-
|
41
|
+
Any large image is cached to disk rather than memory:
|
41
42
|
|
42
|
-
|
43
|
+
<policy domain="resource" name="area" value="1GB"/>
|
44
|
+
|
45
|
+
Define arguments for the memory, map, area, and disk resources with
|
46
|
+
SI prefixes (e.g. 100MB). In addition, resource policies are maximums for
|
47
|
+
each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
|
48
|
+
exceeds policy maximum so memory limit is 1GB).
|
49
|
+
-->
|
50
|
+
|
51
|
+
<!-- NewspaperWorks: allow more than default Ubuntu policy, re: RAM, Disk -->
|
52
|
+
<policy domain="resource" name="memory" value="512MiB"/>
|
53
|
+
<policy domain="resource" name="map" value="1GiB"/>
|
54
|
+
<policy domain="resource" name="width" value="20KP"/>
|
55
|
+
<policy domain="resource" name="height" value="20KP"/>
|
56
|
+
<policy domain="resource" name="area" value="128MB"/>
|
57
|
+
<policy domain="resource" name="disk" value="2GiB"/>
|
43
58
|
|
44
|
-
Define arguments for the memory, map, area, and disk resources with
|
45
|
-
SI prefixes (.e.g 100MB). In addition, resource policies are maximums for
|
46
|
-
each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
|
47
|
-
exceeds policy maximum so memory limit is 1GB).
|
48
|
-
-->
|
49
|
-
<policymap>
|
50
|
-
<!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
|
51
|
-
<!-- <policy domain="resource" name="memory" value="2GiB"/> -->
|
52
|
-
<!-- <policy domain="resource" name="map" value="4GiB"/> -->
|
53
|
-
<!-- <policy domain="resource" name="area" value="1GB"/> -->
|
54
|
-
<!-- <policy domain="resource" name="disk" value="16EB"/> -->
|
55
|
-
<!-- <policy domain="resource" name="file" value="768"/> -->
|
56
|
-
<!-- <policy domain="resource" name="thread" value="4"/> -->
|
57
|
-
<!-- <policy domain="resource" name="throttle" value="0"/> -->
|
58
|
-
<!-- <policy domain="resource" name="time" value="3600"/> -->
|
59
|
-
<!-- <policy domain="system" name="precision" value="6"/> -->
|
60
59
|
<policy domain="cache" name="shared-secret" value="passphrase"/>
|
61
60
|
<policy domain="coder" rights="none" pattern="EPHEMERAL" />
|
62
61
|
<policy domain="coder" rights="none" pattern="URL" />
|
@@ -68,9 +67,10 @@
|
|
68
67
|
<policy domain="coder" rights="none" pattern="WIN" />
|
69
68
|
<policy domain="coder" rights="none" pattern="PLT" />
|
70
69
|
<policy domain="path" rights="none" pattern="@*" />
|
71
|
-
<!-- disable ghostscript format types -->
|
70
|
+
<!-- disable ghostscript format types (except PDF) -->
|
72
71
|
<policy domain="coder" rights="none" pattern="PS" />
|
73
72
|
<policy domain="coder" rights="none" pattern="EPS" />
|
74
|
-
<!-- <policy domain="coder" rights="none" pattern="PDF" /> -->
|
75
73
|
<policy domain="coder" rights="none" pattern="XPS" />
|
74
|
+
<!-- NewspaperWorks: we need to allow PDF here -->
|
75
|
+
<!-- <policy domain="coder" rights="none" pattern="PDF" /> -->
|
76
76
|
</policymap>
|
data/lib/newspaper_works.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require "newspaper_works/engine"
|
2
2
|
require "newspaper_works/errors"
|
3
|
+
require "newspaper_works/jp2_image_metadata"
|
4
|
+
require "newspaper_works/image_tool"
|
3
5
|
require "newspaper_works/ingest"
|
4
6
|
require "newspaper_works/issue_pdf_composer"
|
5
7
|
require "newspaper_works/text_extraction"
|
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
module NewspaperWorks
  # Inspects and converts one image (or PDF) file on disk by running the
  # external `identify` / `convert` (ImageMagick) and `opj_decompress`
  # (OpenJPEG) programs. JP2 sources are detected from their leading bytes
  # and have their metadata read by NewspaperWorks::JP2ImageMetadata.
  class ImageTool
    # `convert` options used to produce 1-bit, Group4-compressed TIFF output.
    MONOCHROME_OPTIONS = %w[-depth 1 -monochrome -compress Group4 -type bilevel].freeze
    # Destination extensions treated as TIFF when deciding on monochrome.
    TIFF_EXTENSIONS = %w[.tif .tiff].freeze
    # Number of leading bytes read to sniff the file type.
    MAGIC_LENGTH = 23

    attr_accessor :path, :ftype

    # @param path [String] path to the source file; its first bytes are read
    #   immediately to determine the file type.
    def initialize(path)
      @path = path
      @ftype = magic
      @metadata = nil
    end

    # @return [Hash] hash with following symbol keys, and respectively
    #   typed String and/or Integer values.
    #   :width, :height - both in Integer px units
    #   :color - (String enumerated from 'gray', 'monochrome', 'color')
    #   :num_components - Integer, number of channels
    #   :bits_per_component - Integer, bits per channel (e.g. 8 vs. 1)
    #   :content_type - RFC 2045 MIME type
    def metadata
      @metadata ||= jp2? ? jp2_metadata : identify_metadata
    end

    # Convert source image to image at destination path, inferring file type
    #   from destination file extension.  In case of JP2 files, create
    #   intermediate file using OpenJPEG 2000 that ImageMagick can use; the
    #   intermediate lives in a temporary directory that is removed once the
    #   conversion has finished.
    #   Only outputs monochrome output if monochrome is true, destination
    #   format is TIFF.
    # @param destination [String] Path to output / destination file
    # @param monochrome [Boolean] true if monochrome output, otherwise false
    # @return [String] standard output captured from `convert`
    # @raise [RuntimeError] if the destination path ends with 'jp2'
    def convert(destination, monochrome = false)
      raise 'JP2 output not yet supported' if destination.end_with?('jp2')
      return convert_image(@path, destination, monochrome) unless jp2?
      # Block form removes the temporary directory (and the intermediate
      # TIFF inside it) even when a step raises.
      Dir.mktmpdir do |workdir|
        convert_image(jp2_to_tiff(@path, workdir), destination, monochrome)
      end
    end

    private

      # Run an external program directly (no shell), so that paths holding
      # spaces or shell metacharacters reach the program verbatim.
      # @param command [Array<#to_s>] program name followed by its arguments
      # @return [String] captured standard output; exit status is ignored
      def run(*command)
        output, _status = Open3.capture2(*command.map(&:to_s))
        output
      end

      # @param filename [String] path whose extension is examined
      # @return [Boolean] true when the extension is .tif or .tiff (any case)
      def tiff?(filename)
        TIFF_EXTENSIONS.include?(File.extname(filename.to_s).downcase)
      end

      # Invoke `convert`, adding bilevel options only for TIFF destinations.
      def convert_image(source, destination, monochrome)
        options = monochrome && tiff?(destination) ? MONOCHROME_OPTIONS : []
        run('convert', source, *options, destination)
      end

      # Decompress a JP2 into `intermediate.tif` inside workdir.
      # @param source [String] path to the JP2 file
      # @param workdir [String] existing directory to hold the intermediate
      # @return [String] path of the intermediate TIFF
      def jp2_to_tiff(source, workdir)
        intermediate_path = File.join(workdir, 'intermediate.tif')
        run('opj_decompress', '-i', source, '-o', intermediate_path)
        intermediate_path
      end

      # Metadata for JP2 sources, read from the file header.
      def jp2_metadata
        result = NewspaperWorks::JP2ImageMetadata.new(path).technical_metadata
        result[:content_type] = 'image/jp2'
        result
      end

      # Find the first line that, ignoring case and surrounding whitespace,
      # starts with key; given "key: value", return the text after the last
      # colon, stripped of leading and trailing whitespace.
      # @param lines [Array<String>] lines of `identify -verbose` output
      # @param key [String] line prefix to look for (matched case-insensitively)
      # @return [String, nil] the value, or nil when no line matches
      def im_line_select(lines, key)
        prefix = key.downcase
        line = lines.find { |l| l.strip.downcase.start_with?(prefix) }
        return if line.nil?
        line.strip.split(':').last.strip
      end

      # @return [Array(Integer, Integer)] width, height in Integer px units
      # @raise [RuntimeError] when the output has no geometry line
      def im_identify_geometry(lines)
        geometry = im_line_select(lines, 'geometry')
        raise "Unable to identify image geometry for #{path}" if geometry.nil?
        # "640x480+0+0" -> "640x480" -> [640, 480]
        geometry.split('+').first.split('x').map(&:to_i)
      end

      # @return [Array<String>] lines of output from imagemagick `identify`
      def im_identify
        run('identify', '-verbose', path).lines
      end

      def im_mime(lines)
        return 'application/pdf' if pdf? # workaround older imagemagick bug
        im_line_select(lines, 'mime type')
      end

      # Fill in :num_components, :color and :bits_per_component on result.
      def populate_im_color!(lines, result)
        bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
        colorspace = im_line_select(lines, 'colorspace')
        color = colorspace == 'Gray' ? 'gray' : 'color'
        # The trailing colon keeps unrelated "alpha ..." labels (for example
        # an "Alpha color:" line) from being read as an alpha channel.
        has_alpha = !im_line_select(lines, 'alpha:').nil?
        result[:num_components] = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
        result[:color] = bpc == 1 ? 'monochrome' : color
        result[:bits_per_component] = bpc
      end

      # Return metadata by means of imagemagick identify
      def identify_metadata
        result = {}
        lines = im_identify
        result[:width], result[:height] = im_identify_geometry(lines)
        result[:content_type] = im_mime(lines)
        populate_im_color!(lines, result)
        result
      end

      # @return [String] up to the first 23 bytes of the file; an empty
      #   String (never nil) when the file is empty.
      def magic
        File.binread(@path, MAGIC_LENGTH).to_s
      end

      def jp2?
        @ftype.to_s.end_with?('ftypjp2')
      end

      def pdf?
        @ftype.to_s.start_with?('%PDF-')
      end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module NewspaperWorks
  # Reads basic technical metadata (pixel dimensions, number of components,
  # bits per component) directly from the bytes of a JP2 file.
  class JP2ImageMetadata
    TOKEN_MARKER_START = "\xFF".b.freeze
    TOKEN_MARKER_SIZ = "\x51".b.freeze
    TOKEN_IHDR = 'ihdr'.freeze
    # Leading bytes inspected to recognise a JP2 file.
    MAGIC_LENGTH = 23
    # Bytes of SIZ segment read after the marker: Lsiz(2) Rsiz(2) Xsiz(4) Ysiz(4).
    SIZ_PREFIX_LENGTH = 12
    # Leading bytes searched for the 'ihdr' box.
    IHDR_SEARCH_LENGTH = 64
    # Bytes of ihdr payload needed: height(4) width(4) NC(2) BPC(1).
    IHDR_PREFIX_LENGTH = 11

    attr_accessor :path

    # @param path [String] path to jp2, for reading
    def initialize(path)
      @path = path
    end

    # Scan forward from the end of the file magic for the first SIZ marker
    # (byte 0xFF immediately followed by 0x51) and read the sizes after it.
    # Informed by ISO/IEC 15444-1:2000, pp. 26-27
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] X size, Y size, in Integer-typed px
    # @raise [IOError] if io is not in binary mode, or the stream ends before
    #   a complete SIZ segment prefix has been read
    def extract_jp2_dim(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      # first 23 bytes are file-magic, we can skip
      io.seek(MAGIC_LENGTH, IO::SEEK_SET)
      siz = nil
      previous = nil
      # io.read(1) returns nil at end of stream, which ends the scan instead
      # of spinning forever on a file that has no SIZ marker.
      while (byte = io.read(1))
        if previous == TOKEN_MARKER_START && byte == TOKEN_MARKER_SIZ
          siz = io.read(SIZ_PREFIX_LENGTH)
          break
        end
        previous = byte
      end
      if siz.nil? || siz.bytesize < SIZ_PREFIX_LENGTH
        raise IOError, 'SIZ marker segment not found'
      end
      # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
      _lsiz, _rsiz, x_siz, y_siz = siz.unpack('nnNN')
      [x_siz, y_siz]
    end

    # Locate the 'ihdr' token in the first 64 bytes and decode the
    # component count and bit depth that follow it.
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] number components, bits-per-component
    # @raise [IOError] if io is not in binary mode, or 'ihdr' and the fields
    #   after it are not wholly inside the first 64 bytes
    def extract_jp2_components(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      io.seek(0, IO::SEEK_SET)
      # IHDR should be in first 64 bytes
      header = io.read(IHDR_SEARCH_LENGTH).to_s
      offset = header.index(TOKEN_IHDR)
      raise IOError, 'ihdr box not found' if offset.nil?
      ihdr_data = header.byteslice(offset + TOKEN_IHDR.bytesize, IHDR_PREFIX_LENGTH)
      if ihdr_data.nil? || ihdr_data.bytesize < IHDR_PREFIX_LENGTH
        raise IOError, 'ihdr box truncated'
      end
      _height, _width, num_components, bpc = ihdr_data.unpack('NNnC')
      # stored as "bit depth of the components in the codestream, minus 1",
      # so add 1; the high bit flags signed samples and is masked off rather
      # than being read as a negative number.
      # NOTE(review): a stored value of 255 appears to mean per-component
      # depths that vary; that case is not decoded here - confirm vs. spec.
      [num_components, (bpc & 0x7F) + 1]
    end

    # Verify the stream starts with JP2 file magic.
    # @param io [IO] stream positioned at the start of the file
    # @raise [IOError] when the first 23 bytes do not end with 'ftypjp2'
    def validate_jp2(io)
      # to_s: read returns nil for an empty file
      magic = io.read(MAGIC_LENGTH).to_s
      raise IOError, 'Not JP2 file' unless magic.end_with?('ftypjp2')
    end

    # @return [Hash] hash with symbol keys :color ('monochrome', 'gray' or
    #   'color'), :num_components, :bits_per_component, :width, :height
    # @raise [IOError] when the file is not a readable JP2
    def technical_metadata
      # Block form closes the file even when validation or parsing raises.
      File.open(path, 'rb') do |io|
        validate_jp2(io)
        x_siz, y_siz = extract_jp2_dim(io)
        nc, bpc = extract_jp2_components(io)
        color = nc >= 3 ? 'color' : 'gray'
        {
          color: bpc == 1 ? 'monochrome' : color,
          num_components: nc,
          bits_per_component: bpc,
          width: x_siz,
          height: y_siz
        }
      end
    end
  end
end
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'newspaper_works/text_extraction/alto_reader'
|
2
|
+
require 'newspaper_works/text_extraction/hocr_reader'
|
2
3
|
require 'newspaper_works/text_extraction/page_ocr'
|
3
4
|
require 'newspaper_works/text_extraction/render_alto'
|
4
5
|
require 'newspaper_works/text_extraction/word_coords_builder'
|
@@ -0,0 +1,173 @@
|
|
1
|
+
require 'active_support/core_ext/module/delegation'
|
2
|
+
require 'json'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module NewspaperWorks
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from hOCR source
    # - Coordinates in px units, unlike ALTO, which may have scaling concerns
    class HOCRReader
      attr_accessor :source, :doc_stream
      delegate :text, :width, :height, :words, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from hOCR
      class HOCRDocStream < Nokogiri::XML::SAX::Document
        # Matches a `bbox x1 y1 x2 y2` property anywhere within a title.
        BBOX_PATTERN = /\bbbox\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/
        # element.class selectors that receive any processing.
        CONSIDERED = ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].freeze

        attr_accessor :text, :words, :width, :height

        def initialize
          super()
          # plain text buffer (unfrozen, appended to in place):
          @text = +''
          # list of word hash, containing word+coord:
          @words = []
          # page width and height to be found in hOCR for `div.ocr_page`
          @width = nil
          @height = nil
          # to hold current word data state across #start_element, #characters,
          #   and #end_element methods (to associate word with coordinates).
          @current = nil
          # [element name, class name or nil] for every element still open,
          #   so #end_element acts on the class of the element being closed
          #   rather than on whichever element happened to open last.
          @open_elements = []
        end

        # Extract the four integers of a `bbox` property from a title value.
        #
        # @param title [String, nil] value of an hOCR `title` attribute
        # @return [Array<Integer>, nil] x1, y1, x2, y2; nil when absent
        def bbox(title)
          match = BBOX_PATTERN.match(title.to_s)
          match && match.captures.map(&:to_i)
        end

        # Return coordinates from `span.ocrx_word` element attribute hash
        #
        # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
        # @return [Array, nil] Array of position x, y, width, height in px;
        #   nil when the title carries no bbox.
        def s_coords(attrs)
          x1, y1, x2, y2 = bbox(attrs['title'])
          return if x1.nil?
          [x1, y1, x2 - x1, y2 - y1]
        end

        # Consider element for processing?
        #   - `div.ocr_page` - to get page width/height
        #   - `span.ocr_line` - to help make plain text readable
        #   - `span.ocrx_word` - for word-coordinate JSON and plain text word
        # @param name [String] Element name
        # @param class_name [String] HTML class name
        # @return [Boolean] true if element should be processed; otherwise false
        def consider?(name, class_name)
          CONSIDERED.include?("#{name}.#{class_name}")
        end

        # Begin a new word; its text is accumulated by #characters.
        def start_word(attrs)
          @current = { word: nil, coordinates: s_coords(attrs) }
        end

        # Record page size from the third and fourth bbox values of the page.
        def start_page(attrs)
          page_box = bbox(attrs['title'])
          return if page_box.nil?
          # width and height:
          @width = page_box[2]
          @height = page_box[3]
        end

        # @return [Boolean] true when the current word has text and four
        #   coordinate values
        def word_complete?
          return false if @current.nil?
          word = @current[:word]
          coords = @current[:coordinates]
          !word.nil? && !word.empty? && !coords.nil? && coords.size == 4
        end

        def end_word
          # add trailing space to plaintext buffer for between words:
          @text << ' '
          @words.push(@current) if word_complete?
          # Forget the finished word so that text between elements is not
          # appended to it and it cannot be pushed a second time.
          @current = nil
        end

        def end_line
          # strip trailing whitespace
          @text.strip!
          # then insert a line break
          @text << "\n"
        end

        # Callback for element start, ignores elements except for:
        #   - `div.ocr_page` - to get page width/height
        #   - `span.ocr_line` - to help make plain text readable
        #   - `span.ocrx_word` - for word-coordinate JSON and plain text word
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          attributes = attrs.to_h
          class_name = attributes['class']
          considered = consider?(name, class_name)
          @open_elements.push([name, considered ? class_name : nil])
          return unless considered
          start_word(attributes) if class_name == 'ocrx_word'
          start_page(attributes) if class_name == 'ocr_page'
        end

        # Callback for text; only text seen while a word with coordinates is
        # open is collected.
        def characters(value)
          return if @current.nil? || @current[:coordinates].nil?
          @current[:word] = @current[:word].to_s + value
          @text << value
        end

        # Callback for element end; at this time, flush word coordinate state
        #   for current word, and append line endings to plain text:
        #
        # @param name [String] element name.
        def end_element(name)
          # Close the innermost open element of this name, discarding any
          # entries above it, so an unbalanced callback cannot desynchronise
          # the bookkeeping.
          index = @open_elements.rindex { |open_name, _class| open_name == name }
          return if index.nil?
          _open_name, class_name = @open_elements.slice!(index..-1).first
          case class_name
          when 'ocr_line' then end_line
          when 'ocrx_word' then end_word
          end
        end

        # Callback for completion of parsing hOCR, used to normalize generated
        #   text content (strip unneeded whitespace incidental to output):
        #   carriage returns are removed, every line is stripped, and blank
        #   lines are dropped.
        def end_document
          lines = @text.delete("\r").split("\n").map(&:strip).reject(&:empty?)
          @text = lines.join("\n")
        end
      end

      # Construct with either path or HTML [String]
      #
      # @param html [String], and process document
      def initialize(html)
        @source = isxml?(html) ? html : File.read(html)
        @doc_stream = HOCRDocStream.new
        parser = Nokogiri::HTML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml/html
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if value appears to be XML/HTML, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(
          @doc_stream.words,
          @doc_stream.width,
          @doc_stream.height
        )
        builder.to_json
      end
    end
  end
end
|