newspaper_works 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +5 -5
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +4 -0
  4. data/.travis.yml +2 -2
  5. data/README.md +14 -13
  6. data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
  7. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
  8. data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
  9. data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
  10. data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
  11. data/config/locales/newspaper_article.de.yml +1 -1
  12. data/config/locales/newspaper_article.en.yml +1 -1
  13. data/config/locales/newspaper_article.es.yml +1 -1
  14. data/config/locales/newspaper_article.fr.yml +1 -1
  15. data/config/locales/newspaper_article.it.yml +1 -1
  16. data/config/locales/newspaper_article.pt-BR.yml +1 -1
  17. data/config/locales/newspaper_article.zh.yml +1 -1
  18. data/config/locales/newspaper_container.de.yml +1 -1
  19. data/config/locales/newspaper_container.en.yml +1 -1
  20. data/config/locales/newspaper_container.es.yml +1 -1
  21. data/config/locales/newspaper_container.fr.yml +1 -1
  22. data/config/locales/newspaper_container.it.yml +1 -1
  23. data/config/locales/newspaper_container.pt-BR.yml +1 -1
  24. data/config/locales/newspaper_container.zh.yml +1 -1
  25. data/config/locales/newspaper_issue.de.yml +1 -1
  26. data/config/locales/newspaper_issue.en.yml +1 -1
  27. data/config/locales/newspaper_issue.es.yml +1 -1
  28. data/config/locales/newspaper_issue.fr.yml +1 -1
  29. data/config/locales/newspaper_issue.it.yml +2 -2
  30. data/config/locales/newspaper_issue.pt-BR.yml +2 -2
  31. data/config/locales/newspaper_issue.zh.yml +2 -2
  32. data/config/locales/newspaper_page.de.yml +1 -1
  33. data/config/locales/newspaper_page.en.yml +1 -1
  34. data/config/locales/newspaper_page.es.yml +1 -1
  35. data/config/locales/newspaper_page.fr.yml +1 -1
  36. data/config/locales/newspaper_page.it.yml +1 -1
  37. data/config/locales/newspaper_page.pt-BR.yml +1 -1
  38. data/config/locales/newspaper_page.zh.yml +1 -1
  39. data/config/locales/newspaper_title.de.yml +1 -1
  40. data/config/locales/newspaper_title.en.yml +1 -1
  41. data/config/locales/newspaper_title.es.yml +1 -1
  42. data/config/locales/newspaper_title.fr.yml +1 -1
  43. data/config/locales/newspaper_title.it.yml +1 -1
  44. data/config/locales/newspaper_title.pt-BR.yml +1 -1
  45. data/config/locales/newspaper_title.zh.yml +1 -1
  46. data/config/locales/newspaper_works.de.yml +98 -0
  47. data/config/locales/newspaper_works.en.yml +67 -0
  48. data/config/locales/newspaper_works.es.yml +96 -0
  49. data/config/locales/newspaper_works.fr.yml +97 -0
  50. data/config/locales/newspaper_works.it.yml +90 -0
  51. data/config/locales/newspaper_works.pt-BR.yml +96 -0
  52. data/config/locales/newspaper_works.zh.yml +90 -0
  53. data/config/vendor/fits.xml +55 -0
  54. data/config/vendor/imagemagick-6-policy.xml +39 -39
  55. data/lib/newspaper_works.rb +2 -0
  56. data/lib/newspaper_works/image_tool.rb +119 -0
  57. data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
  58. data/lib/newspaper_works/text_extraction.rb +1 -0
  59. data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
  60. data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
  61. data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
  62. data/lib/newspaper_works/version.rb +1 -1
  63. data/newspaper_works.gemspec +2 -3
  64. data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
  65. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  66. data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
  67. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
  68. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
  69. data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
  70. data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
  71. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
  72. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
  73. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
  74. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
  75. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
  76. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
  77. data/spec/spec_helper.rb +19 -0
  78. metadata +21 -22
@@ -0,0 +1,55 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <fits_configuration>
3
+ <!-- Order of the tools determines preference -->
4
+ <tools>
5
+ <!-- exclude-exts attribute is a comma delimited list of file extensions that the tool should not try to process -->
6
+ <!-- include-exts attribute is a comma delimited list of file extensions that are the only ones the tool will process -->
7
+ <!-- classpath-dirs attribute is a list of directories where any tool-specific Java JAR files and configuration files used solely by these JAR files -->
8
+ <tool class="edu.harvard.hul.ois.fits.tools.mediainfo.MediaInfo" include-exts="avi,mov,mpg,mpeg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,m2ts,mpeg4" classpath-dirs="lib/mediainfo" />
9
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.AudioInfo" include-exts="wav" classpath-dirs="lib/audioinfo" />
10
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.ADLTool" include-exts="adl" classpath-dirs="lib/adltool" />
11
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.VTTTool" include-exts="vtt" />
12
+ <tool class="edu.harvard.hul.ois.fits.tools.droid.Droid" exclude-exts="odm,m4a" classpath-dirs="lib/droid" />
13
+ <tool class="edu.harvard.hul.ois.fits.tools.jhove.Jhove" exclude-exts="dng,mbx,mbox,arw,adl,eml,java,doc,docx,docm,odt,rtf,pages,wpd,wp,epub,csv,avi,mov,mpg,mpeg,mkv,mp4,mpeg4,m2ts,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,pcd,zip" classpath-dirs="lib/jhove" />
14
+ <tool class="edu.harvard.hul.ois.fits.tools.fileutility.FileUtility" exclude-exts="dng,wps,adl,jar,epub,csv" classpath-dirs="lib/fileutility" />
15
+ <tool class="edu.harvard.hul.ois.fits.tools.exiftool.Exiftool" exclude-exts="txt,wps,vsd,jar,avi,mov,mpg,mpeg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,m2ts,mpeg4" classpath-dirs="lib/exiftool" />
16
+ <tool class="edu.harvard.hul.ois.fits.tools.nlnz.MetadataExtractor" include-exts="bmp,gif,jpg,jpeg,wp,wpd,odt,doc,pdf,mp3,bfw,flac,html,xml,arc" classpath-dirs="lib/nzmetool,xml/nlnz"/>
17
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.FileInfo" classpath-dirs="lib/fileinfo" />
18
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.XmlMetadata" include-exts="xml" classpath-dirs="lib/xmlmetadata" />
19
+ <tool class="edu.harvard.hul.ois.fits.tools.ffident.FFIdent" exclude-exts="dng,wps,vsd,jar,ppt,rtf" classpath-dirs="lib/ffident" />
20
+
21
+ </tools>
22
+
23
+ <output>
24
+ <dataConsolidator class="edu.harvard.hul.ois.fits.consolidation.OISConsolidator"/>
25
+ <display-tool-output>false</display-tool-output>
26
+ <report-conflicts>true</report-conflicts>
27
+ <validate-tool-output>false</validate-tool-output>
28
+ <internal-output-schema>xml/fits_output.xsd</internal-output-schema>
29
+ <external-output-schema>http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd</external-output-schema>
30
+ <fits-xml-namespace>http://hul.harvard.edu/ois/xml/ns/fits/fits_output</fits-xml-namespace>
31
+ <enable-statistics>true</enable-statistics>
32
+ <enable-checksum>true</enable-checksum>
33
+ <!-- The below controls the exclusion of the checksum for certain files, even if enable-checksum is true -->
34
+ <!-- Video Exclusions -->
35
+ <!-- <checksum-exclusions exclude-exts="avi,mov,mpg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv"/> -->
36
+ <!-- Audio Exclusions -->
37
+ <!-- <checksum-exclusions exclude-exts="wav,aif,mp3,mp4,m4a,ra,rm"/> -->
38
+ </output>
39
+
40
+ <process>
41
+ <max-threads>20</max-threads>
42
+ </process>
43
+
44
+ <!-- file name of the droid signature file to use in tools/droid/-->
45
+ <droid_sigfile>DROID_SignatureFile_V90.xml</droid_sigfile>
46
+ <!-- Limits number of bytes DROID reads in (in KB) for processing large files for the listed file extensions. -->
47
+ <!-- Note: This should only be used with files that can provide sufficient metadata at beginning of the file -->
48
+ <!-- <droid_read_limit include-exts="mov,mxf" read-limit-kb="64" /> -->
49
+
50
+ <!-- the fits home is used by the MediaInfo tool to load the jna api libs -->
51
+ <!-- in most cases you won't need to change -->
52
+ <!-- example for BB will be /fits -->
53
+ <fits_home>.</fits_home>
54
+
55
+ </fits_configuration>
@@ -2,61 +2,60 @@
2
2
  <!DOCTYPE policymap [
3
3
  <!ELEMENT policymap (policy)+>
4
4
  <!ELEMENT policy (#PCDATA)>
5
- <!ATTLIST policy domain (delegate|coder|filter|path|resource) #IMPLIED>
5
+ <!ATTLIST policy domain (delegate|coder|filter|path|resource|cache) #IMPLIED>
6
6
  <!ATTLIST policy name CDATA #IMPLIED>
7
7
  <!ATTLIST policy rights CDATA #IMPLIED>
8
8
  <!ATTLIST policy pattern CDATA #IMPLIED>
9
9
  <!ATTLIST policy value CDATA #IMPLIED>
10
10
  ]>
11
- <policy domain="coder" rights="read|write" pattern="PDF" />
12
- <policy domain="coder" rights="read|write" pattern="LABEL" />
13
- <!--
14
- Configure ImageMagick policies.
11
+ <policymap>
12
+ <policy domain="coder" rights="read|write" pattern="PDF" />
13
+ <policy domain="coder" rights="read|write" pattern="LABEL" />
14
+ <!--
15
+ Configure ImageMagick policies.
15
16
 
16
- Domains include system, delegate, coder, filter, path, or resource.
17
+ Domains include system, delegate, coder, filter, path, or resource.
17
18
 
18
- Rights include none, read, write, and execute. Use | to combine them,
19
- for example: "read | write" to permit read from, or write to, a path.
19
+ Rights include none, read, write, and execute. Use | to combine them,
20
+ for example: "read | write" to permit read from, or write to, a path.
20
21
 
21
- Use a glob expression as a pattern.
22
+ Use a glob expression as a pattern.
22
23
 
23
- Suppose we do not want users to process MPEG video images:
24
+ Suppose we do not want users to process MPEG video images:
24
25
 
25
- <policy domain="delegate" rights="none" pattern="mpeg:decode" />
26
+ <policy domain="delegate" rights="none" pattern="mpeg:decode" />
26
27
 
27
- Here we do not want users reading images from HTTP:
28
+ Here we do not want users reading images from HTTP:
28
29
 
29
- <policy domain="coder" rights="none" pattern="HTTP" />
30
+ <policy domain="coder" rights="none" pattern="HTTP" />
30
31
 
31
- Lets prevent users from executing any image filters:
32
+ Let's prevent users from executing any image filters:
32
33
 
33
- <policy domain="filter" rights="none" pattern="*" />
34
+ <policy domain="filter" rights="none" pattern="*" />
34
35
 
35
- The /repository file system is restricted to read only. We use a glob
36
- expression to match all paths that start with /repository:
37
-
38
- <policy domain="path" rights="read" pattern="/repository/*" />
36
+ The /repository file system is restricted to read only. We use a glob
37
+ expression to match all paths that start with /repository:
38
+
39
+ <policy domain="path" rights="read" pattern="/repository/*" />
39
40
 
40
- Any large image is cached to disk rather than memory:
41
+ Any large image is cached to disk rather than memory:
41
42
 
42
- <policy domain="resource" name="area" value="1GB"/>
43
+ <policy domain="resource" name="area" value="1GB"/>
44
+
45
+ Define arguments for the memory, map, area, and disk resources with
46
+ SI prefixes (e.g. 100MB). In addition, resource policies are maximums for
47
+ each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
48
+ exceeds policy maximum so memory limit is 1GB).
49
+ -->
50
+
51
+ <!-- NewspaperWorks: allow more than default Ubuntu policy, re: RAM, Disk -->
52
+ <policy domain="resource" name="memory" value="512MiB"/>
53
+ <policy domain="resource" name="map" value="1GiB"/>
54
+ <policy domain="resource" name="width" value="20KP"/>
55
+ <policy domain="resource" name="height" value="20KP"/>
56
+ <policy domain="resource" name="area" value="128MB"/>
57
+ <policy domain="resource" name="disk" value="2GiB"/>
43
58
 
44
- Define arguments for the memory, map, area, and disk resources with
45
- SI prefixes (.e.g 100MB). In addition, resource policies are maximums for
46
- each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
47
- exceeds policy maximum so memory limit is 1GB).
48
- -->
49
- <policymap>
50
- <!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
51
- <!-- <policy domain="resource" name="memory" value="2GiB"/> -->
52
- <!-- <policy domain="resource" name="map" value="4GiB"/> -->
53
- <!-- <policy domain="resource" name="area" value="1GB"/> -->
54
- <!-- <policy domain="resource" name="disk" value="16EB"/> -->
55
- <!-- <policy domain="resource" name="file" value="768"/> -->
56
- <!-- <policy domain="resource" name="thread" value="4"/> -->
57
- <!-- <policy domain="resource" name="throttle" value="0"/> -->
58
- <!-- <policy domain="resource" name="time" value="3600"/> -->
59
- <!-- <policy domain="system" name="precision" value="6"/> -->
60
59
  <policy domain="cache" name="shared-secret" value="passphrase"/>
61
60
  <policy domain="coder" rights="none" pattern="EPHEMERAL" />
62
61
  <policy domain="coder" rights="none" pattern="URL" />
@@ -68,9 +67,10 @@
68
67
  <policy domain="coder" rights="none" pattern="WIN" />
69
68
  <policy domain="coder" rights="none" pattern="PLT" />
70
69
  <policy domain="path" rights="none" pattern="@*" />
71
- <!-- disable ghostscript format types -->
70
+ <!-- disable ghostscript format types (except PDF) -->
72
71
  <policy domain="coder" rights="none" pattern="PS" />
73
72
  <policy domain="coder" rights="none" pattern="EPS" />
74
- <!-- <policy domain="coder" rights="none" pattern="PDF" /> -->
75
73
  <policy domain="coder" rights="none" pattern="XPS" />
74
+ <!-- NewspaperWorks: we need to allow PDF here -->
75
+ <!-- <policy domain="coder" rights="none" pattern="PDF" /> -->
76
76
  </policymap>
@@ -1,5 +1,7 @@
1
1
  require "newspaper_works/engine"
2
2
  require "newspaper_works/errors"
3
+ require "newspaper_works/jp2_image_metadata"
4
+ require "newspaper_works/image_tool"
3
5
  require "newspaper_works/ingest"
4
6
  require "newspaper_works/issue_pdf_composer"
5
7
  require "newspaper_works/text_extraction"
@@ -0,0 +1,119 @@
1
+ require 'open3'
2
+ require 'tmpdir'
3
+
4
module NewspaperWorks
  # Reads technical metadata from, and converts, a single image or PDF file.
  # Shells out to the ImageMagick `identify` / `convert` commands and, for
  # JPEG 2000 sources, to OpenJPEG's `opj_decompress`; JP2 metadata is read
  # via NewspaperWorks::JP2ImageMetadata.
  class ImageTool
    # Number of leading bytes sniffed to detect the file type.
    MAGIC_LENGTH = 23
    # Extra `convert` arguments used for bilevel (1-bit) TIFF output.
    MONOCHROME_OPTIONS = %w[-depth 1 -monochrome -compress Group4 -type bilevel].freeze
    # Destination extensions (lower case) treated as TIFF output.
    TIFF_EXTENSIONS = %w[.tif .tiff].freeze

    attr_accessor :path, :ftype

    # @param path [String] path to the source file; its leading bytes are
    #   read immediately to detect the file type.
    def initialize(path)
      @path = path
      @ftype = magic
      @metadata = nil
    end

    # Technical metadata for the source file, computed once and memoized.
    #
    # @return [Hash] hash with following symbol keys, and respectively
    #   typed String and/or Integer values.
    #   :width, :height - both in Integer px units
    #   :color - (String enumerated from 'gray', 'monochrome', 'color')
    #   :num_components - Integer, number of channels
    #   :bits_per_component - Integer, bits per channel (e.g. 8 vs. 1)
    #   :content_type - RFC 2045 MIME type
    def metadata
      @metadata ||= jp2? ? jp2_metadata : identify_metadata
    end

    # Convert source image to image at destination path, inferring file type
    # from destination file extension. In case of JP2 sources, an
    # intermediate TIFF is first created with OpenJPEG in a temporary
    # directory, which is removed again once the conversion has run.
    # Monochrome output is only produced if monochrome is true AND the
    # destination format is TIFF.
    #
    # @param destination [String] Path to output / destination file
    # @param monochrome [Boolean] true if monochrome output, otherwise false
    # @return [String] standard output of the `convert` command
    # @raise [RuntimeError] if destination ends with 'jp2' (unsupported)
    def convert(destination, monochrome = false)
      raise 'JP2 output not yet supported' if destination.end_with?('jp2')
      return convert_image(@path, destination, monochrome) unless jp2?
      # block form guarantees the intermediate file's directory is cleaned up
      Dir.mktmpdir do |workdir|
        convert_image(jp2_to_tiff(@path, workdir), destination, monochrome)
      end
    end

    private

    # Run an external command WITHOUT a shell, so that paths containing
    # spaces or shell metacharacters are passed through verbatim.
    #
    # @param cmd [Array<String>] program name followed by its arguments
    # @return [String] captured standard output (exit status is not checked,
    #   matching the previous backtick-based behavior)
    def run_command(*cmd)
      output, _status = Open3.capture2(*cmd)
      output
    end

    # @param destination [String] output path
    # @return [Boolean] true if destination has a TIFF file extension
    def tiff_path?(destination)
      TIFF_EXTENSIONS.include?(File.extname(destination).downcase)
    end

    # Invoke ImageMagick `convert` from source to destination.
    def convert_image(source, destination, monochrome)
      cmd = ['convert', source]
      cmd.concat(MONOCHROME_OPTIONS) if monochrome && tiff_path?(destination)
      cmd << destination
      run_command(*cmd)
    end

    # Decode JP2 source into an intermediate TIFF inside workdir.
    #
    # @param source [String] path to JP2 file
    # @param workdir [String] existing directory to write into
    # @return [String] path to the intermediate TIFF
    def jp2_to_tiff(source, workdir)
      intermediate_path = File.join(workdir, 'intermediate.tif')
      run_command('opj_decompress', '-i', source, '-o', intermediate_path)
      intermediate_path
    end

    def jp2_metadata
      result = NewspaperWorks::JP2ImageMetadata.new(path).technical_metadata
      result[:content_type] = 'image/jp2'
      result
    end

    # Given lines of "Key: value" output, return the value (String stripped
    # of leading and trailing whitespace) of the first line whose key
    # matches, compared case-insensitively. The key must be directly
    # followed by a colon, so that e.g. 'alpha' does not match a line
    # beginning 'Alpha color:'.
    #
    # @return [String, nil] value, or nil when no line matches
    def im_line_select(lines, key)
      prefix = "#{key.downcase}:"
      line = lines.find { |l| l.downcase.strip.start_with?(prefix) }
      return if line.nil?
      line.strip.split(':')[-1].strip
    end

    # @return [Array(Integer, Integer)] width, height in Integer px units
    def im_identify_geometry(lines)
      img_geo = im_line_select(lines, 'geometry').split('+')[0]
      img_geo.split('x').map(&:to_i)
    end

    # @return [Array<String>] lines of output from imagemagick `identify`
    def im_identify
      run_command('identify', '-verbose', path).lines
    end

    def im_mime(lines)
      return 'application/pdf' if pdf? # workaround older imagemagick bug
      im_line_select(lines, 'mime type')
    end

    # Populate :num_components, :color and :bits_per_component in result.
    def populate_im_color!(lines, result)
      bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
      colorspace = im_line_select(lines, 'colorspace')
      color = colorspace == 'Gray' ? 'gray' : 'color'
      # an 'Alpha: ...' line is taken to mean an alpha channel is present
      has_alpha = !im_line_select(lines, 'alpha').nil?
      result[:num_components] = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
      result[:color] = bpc == 1 ? 'monochrome' : color
      result[:bits_per_component] = bpc
    end

    # Return metadata by means of imagemagick identify
    def identify_metadata
      result = {}
      lines = im_identify
      result[:width], result[:height] = im_identify_geometry(lines)
      result[:content_type] = im_mime(lines)
      populate_im_color!(lines, result)
      result
    end

    # Leading bytes of the file, read in binary mode.
    #
    # @return [String] up to MAGIC_LENGTH bytes; empty String for empty file
    def magic
      File.binread(@path, MAGIC_LENGTH, 0).to_s
    end

    def jp2?
      @ftype.end_with?('ftypjp2')
    end

    def pdf?
      @ftype.start_with?('%PDF-')
    end
  end
end
@@ -0,0 +1,81 @@
1
module NewspaperWorks
  # Extracts basic technical metadata (pixel dimensions, number of
  # components, bit depth) from a JPEG 2000 (JP2) file by reading its
  # header bytes directly, without decoding the image.
  class JP2ImageMetadata
    TOKEN_MARKER_START = "\xFF".b.freeze
    TOKEN_MARKER_SIZ = "\x51".b.freeze
    TOKEN_IHDR = 'ihdr'.freeze
    # Length of the file-magic prefix that ends with the 'ftypjp2' brand.
    MAGIC_LENGTH = 23
    # Bytes read after the SIZ marker: Lsiz (2), Rsiz (2), Xsiz (4), Ysiz (4).
    SIZ_LENGTH = 12
    # Bytes of ihdr payload needed: height (4), width (4), NC (2), BPC (1).
    IHDR_LENGTH = 11

    attr_accessor :path

    # @param path [String] path to JP2 file
    def initialize(path)
      @path = path
    end

    # Scan for the SIZ marker (0xFF 0x51) and read the image size from it.
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] X size, Y size, in Integer-typed px
    # @raise [IOError] if io is not in binary mode, or if end of file is
    #   reached before a complete SIZ segment was read
    def extract_jp2_dim(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      # Informed by ISO/IEC 15444-1:2000, pp. 26-27
      # via:
      # http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
      #
      # first 23 bytes are file-magic, we can skip
      io.seek(MAGIC_LENGTH, IO::SEEK_SET)
      previous = nil
      # read one byte at a time until 0x51 directly follows 0xFF; io.read
      # returns nil at end of file, which terminates the scan
      while (byte = io.read(1))
        break if previous == TOKEN_MARKER_START && byte == TOKEN_MARKER_SIZ
        previous = byte
      end
      raise IOError, 'SIZ marker not found' if byte.nil?
      siz = io.read(SIZ_LENGTH)
      raise IOError, 'truncated SIZ segment' if siz.nil? || siz.bytesize < SIZ_LENGTH
      # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
      x_siz = siz.byteslice(4, 4).unpack('N').first
      y_siz = siz.byteslice(8, 4).unpack('N').first
      [x_siz, y_siz]
    end

    # Read component count and bit depth from the 'ihdr' box.
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] number components, bits-per-component
    # @raise [IOError] if io is not in binary mode, or no complete 'ihdr'
    #   box is found in the first 64 bytes
    def extract_jp2_components(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      io.seek(0, IO::SEEK_SET)
      # IHDR should be in first 64 bytes
      header = io.read(64).to_s
      offset = header.index(TOKEN_IHDR)
      raise IOError, 'ihdr box not found' if offset.nil?
      ihdr_data = header.byteslice(offset + TOKEN_IHDR.bytesize, IHDR_LENGTH)
      raise IOError, 'truncated ihdr box' if ihdr_data.nil? || ihdr_data.bytesize < IHDR_LENGTH
      num_components = ihdr_data.byteslice(8, 2).unpack('n').first
      # stored as "bit depth of the components in the codestream, minus 1",
      # so add 1; the high bit only flags signed components, so mask it off
      # NOTE(review): a field value of 255 (varying bit depth, per the spec)
      # is not treated specially here - confirm whether that case matters
      bits_per_component = (ihdr_data.getbyte(10) & 0x7F) + 1
      [num_components, bits_per_component]
    end

    # Verify that the stream starts with JP2 file magic.
    #
    # @param io [IO] IO stream positioned at start of file
    # @raise [IOError] if the leading bytes do not end with 'ftypjp2'
    def validate_jp2(io)
      # verify file is jp2
      magic = io.read(MAGIC_LENGTH).to_s
      raise IOError, 'Not JP2 file' unless magic.end_with?('ftypjp2')
    end

    # Read technical metadata from the JP2 file at #path.
    #
    # @return [Hash] hash with symbol keys :color ('monochrome', 'gray' or
    #   'color'), :num_components, :bits_per_component, :width, :height
    # @raise [IOError] if the file is not a JP2 or its headers are incomplete
    def technical_metadata
      # block form closes the file even when validation/parsing raises
      File.open(path, 'rb') do |io|
        validate_jp2(io)
        x_siz, y_siz = extract_jp2_dim(io)
        nc, bpc = extract_jp2_components(io)
        color = nc >= 3 ? 'color' : 'gray'
        {
          color: bpc == 1 ? 'monochrome' : color,
          num_components: nc,
          bits_per_component: bpc,
          width: x_siz,
          height: y_siz
        }
      end
    end
  end
end
@@ -1,4 +1,5 @@
1
1
  require 'newspaper_works/text_extraction/alto_reader'
2
+ require 'newspaper_works/text_extraction/hocr_reader'
2
3
  require 'newspaper_works/text_extraction/page_ocr'
3
4
  require 'newspaper_works/text_extraction/render_alto'
4
5
  require 'newspaper_works/text_extraction/word_coords_builder'
@@ -0,0 +1,173 @@
1
+ require 'active_support/core_ext/module/delegation'
2
+ require 'json'
3
+ require 'nokogiri'
4
+
5
module NewspaperWorks
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from hOCR source
    # - Coordinates in px units, unlike ALTO, which may have scaling concerns
    class HOCRReader
      attr_accessor :source, :doc_stream
      delegate :text, :width, :height, :words, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from hOCR
      class HOCRDocStream < Nokogiri::XML::SAX::Document
        attr_accessor :text, :words, :width, :height

        def initialize
          super()
          # plain text buffer (unfrozen copy, since it is appended in place):
          @text = ''.dup
          # list of word hash, containing word+coord:
          @words = []
          # page width and height to be found in hOCR for `div.ocr_page`
          @width = nil
          @height = nil
          # to hold current word data state across #start_element, #characters,
          # and #end_element methods (to associate word with coordinates);
          # nil whenever no word element is open.
          @current = nil
          # one entry per currently open element: its class name when it is an
          # element of interest (see #consider?), otherwise nil. Lets
          # #end_element know which element is actually being closed.
          @class_stack = []
        end

        # Extract the numbers of the `bbox` property from a hOCR `title`
        # attribute, wherever among the semicolon-separated properties it
        # appears.
        #
        # @param title [String, nil] value of `title` attribute
        # @return [Array<Integer>] x1, y1, x2, y2; empty if no bbox property
        def bbox_from_title(title)
          properties = title.to_s.split(';').map(&:strip)
          bbox = properties.find { |property| property.start_with?('bbox ') }
          return [] if bbox.nil?
          bbox.split(' ').drop(1).map(&:to_i)
        end

        # Return coordinates from `span.ocrx_word` element attribute hash
        #
        # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
        # @return [Array] Array of position x, y, width, height in px; empty
        #   when the element has no complete bbox.
        def s_coords(attrs)
          x1, y1, x2, y2 = bbox_from_title(attrs['title'])
          return [] if y2.nil?
          [x1, y1, x2 - x1, y2 - y1]
        end

        # Consider element for processing?
        #   - `div.ocr_page` - to get page width/height
        #   - `span.ocr_line` - to help make plain text readable
        #   - `span.ocrx_word` - for word-coordinate JSON and plain text word
        # @param name [String] Element name
        # @param class_name [String] HTML class name
        # @return [Boolean] true if element should be processed; otherwise false
        def consider?(name, class_name)
          selector = "#{name}.#{class_name}"
          ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].include?(selector)
        end

        def start_word(attrs)
          @current = {}
          # will be replaced during #characters method call:
          @current[:word] = nil
          @current[:coordinates] = s_coords(attrs)
        end

        # Record page width and height from the bbox of `div.ocr_page`;
        # both stay nil if the element carries no bbox.
        def start_page(attrs)
          bbox = bbox_from_title(attrs['title'])
          # width and height:
          @width = bbox[2]
          @height = bbox[3]
        end

        def word_complete?
          return false if @current.nil?
          coords = @current[:coordinates]
          @current[:word] && !@current[:word].empty? && coords.size == 4
        end

        def end_word
          # add trailing space to plaintext buffer for between words:
          @text << ' '
          @words.push(@current) if word_complete?
          # word is finished: later character data (e.g. whitespace between
          # elements) must not be appended to it, nor may it be stored twice
          @current = nil
        end

        def end_line
          # strip trailing whitespace
          @text.strip!
          # then insert a line break
          @text << "\n"
        end

        # Callback for element start, ignores elements except for:
        #   - `div.ocr_page` - to get page width/height
        #   - `span.ocr_line` - to help make plain text readable
        #   - `span.ocrx_word` - for word-coordinate JSON and plain text word
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          attributes = attrs.to_h
          class_name = attributes['class']
          considered = consider?(name, class_name)
          # every opened element gets an entry, so pops in #end_element pair up
          # NOTE(review): relies on the parser reporting one end_element per
          # start_element - confirm for the parser backends in use
          @class_stack.push(considered ? class_name : nil)
          return unless considered
          start_word(attributes) if class_name == 'ocrx_word'
          start_page(attributes) if class_name == 'ocr_page'
        end

        # Callback for character data; only collected while a word is open.
        #
        # @param value [String] character data
        def characters(value)
          return if @current.nil?
          return if @current[:coordinates].nil?
          @current[:word] ||= ''
          @current[:word] += value
          @text << value
        end

        # Callback for element end; at this time, flush word coordinate state
        # for current word, and append line endings to plain text:
        #
        # @param name [String] element name.
        def end_element(_name)
          # class of the element being closed (not of the last one opened)
          class_name = @class_stack.pop
          end_line if class_name == 'ocr_line'
          end_word if class_name == 'ocrx_word'
        end

        # Callback for completion of parsing hOCR, used to normalize generated
        # text content (strip unneeded whitespace incidental to output).
        def end_document
          # postprocess @text to remove trailing spaces on lines
          @text = @text.split("\n").map(&:strip).join("\n")
          # remove excess line break
          @text.gsub!(/\n+/, "\n")
          # remove carriage returns (in place; plain #delete returns a copy)
          @text.delete!("\r")
          # remove trailing whitespace at end of buffer
          @text.strip!
        end
      end

      # Construct with either path or HTML [String], and process document
      #
      # @param html [String] hOCR markup, or path to a file containing it
      def initialize(html)
        @source = isxml?(html) ? html : File.read(html)
        @doc_stream = HOCRDocStream.new
        parser = Nokogiri::HTML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml/html
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if value appears to be XML/HTML, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        words = @doc_stream.words
        builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(
          words,
          @doc_stream.width,
          @doc_stream.height
        )
        builder.to_json
      end
    end
  end
end