newspaper_works 0.1.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +5 -5
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +4 -0
  4. data/.travis.yml +2 -2
  5. data/README.md +14 -13
  6. data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
  7. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
  8. data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
  9. data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
  10. data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
  11. data/config/locales/newspaper_article.de.yml +1 -1
  12. data/config/locales/newspaper_article.en.yml +1 -1
  13. data/config/locales/newspaper_article.es.yml +1 -1
  14. data/config/locales/newspaper_article.fr.yml +1 -1
  15. data/config/locales/newspaper_article.it.yml +1 -1
  16. data/config/locales/newspaper_article.pt-BR.yml +1 -1
  17. data/config/locales/newspaper_article.zh.yml +1 -1
  18. data/config/locales/newspaper_container.de.yml +1 -1
  19. data/config/locales/newspaper_container.en.yml +1 -1
  20. data/config/locales/newspaper_container.es.yml +1 -1
  21. data/config/locales/newspaper_container.fr.yml +1 -1
  22. data/config/locales/newspaper_container.it.yml +1 -1
  23. data/config/locales/newspaper_container.pt-BR.yml +1 -1
  24. data/config/locales/newspaper_container.zh.yml +1 -1
  25. data/config/locales/newspaper_issue.de.yml +1 -1
  26. data/config/locales/newspaper_issue.en.yml +1 -1
  27. data/config/locales/newspaper_issue.es.yml +1 -1
  28. data/config/locales/newspaper_issue.fr.yml +1 -1
  29. data/config/locales/newspaper_issue.it.yml +2 -2
  30. data/config/locales/newspaper_issue.pt-BR.yml +2 -2
  31. data/config/locales/newspaper_issue.zh.yml +2 -2
  32. data/config/locales/newspaper_page.de.yml +1 -1
  33. data/config/locales/newspaper_page.en.yml +1 -1
  34. data/config/locales/newspaper_page.es.yml +1 -1
  35. data/config/locales/newspaper_page.fr.yml +1 -1
  36. data/config/locales/newspaper_page.it.yml +1 -1
  37. data/config/locales/newspaper_page.pt-BR.yml +1 -1
  38. data/config/locales/newspaper_page.zh.yml +1 -1
  39. data/config/locales/newspaper_title.de.yml +1 -1
  40. data/config/locales/newspaper_title.en.yml +1 -1
  41. data/config/locales/newspaper_title.es.yml +1 -1
  42. data/config/locales/newspaper_title.fr.yml +1 -1
  43. data/config/locales/newspaper_title.it.yml +1 -1
  44. data/config/locales/newspaper_title.pt-BR.yml +1 -1
  45. data/config/locales/newspaper_title.zh.yml +1 -1
  46. data/config/locales/newspaper_works.de.yml +98 -0
  47. data/config/locales/newspaper_works.en.yml +67 -0
  48. data/config/locales/newspaper_works.es.yml +96 -0
  49. data/config/locales/newspaper_works.fr.yml +97 -0
  50. data/config/locales/newspaper_works.it.yml +90 -0
  51. data/config/locales/newspaper_works.pt-BR.yml +96 -0
  52. data/config/locales/newspaper_works.zh.yml +90 -0
  53. data/config/vendor/fits.xml +55 -0
  54. data/config/vendor/imagemagick-6-policy.xml +39 -39
  55. data/lib/newspaper_works.rb +2 -0
  56. data/lib/newspaper_works/image_tool.rb +119 -0
  57. data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
  58. data/lib/newspaper_works/text_extraction.rb +1 -0
  59. data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
  60. data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
  61. data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
  62. data/lib/newspaper_works/version.rb +1 -1
  63. data/newspaper_works.gemspec +2 -3
  64. data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
  65. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  66. data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
  67. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
  68. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
  69. data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
  70. data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
  71. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
  72. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
  73. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
  74. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
  75. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
  76. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
  77. data/spec/spec_helper.rb +19 -0
  78. metadata +21 -22
@@ -0,0 +1,55 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <fits_configuration>
3
+ <!-- Order of the tools determines preference -->
4
+ <tools>
5
+ <!-- exclude-exts attribute is a comma delimited list of file extensions that the tool should not try to process -->
6
+ <!-- include-exts attribute is a comma delimited list of file extensions that are the only ones the tool will process -->
7
+ <!-- classpath-dirs attribute is a list of directories containing any tool-specific Java JAR files, plus the configuration files used solely by those JAR files -->
8
+ <tool class="edu.harvard.hul.ois.fits.tools.mediainfo.MediaInfo" include-exts="avi,mov,mpg,mpeg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,m2ts,mpeg4" classpath-dirs="lib/mediainfo" />
9
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.AudioInfo" include-exts="wav" classpath-dirs="lib/audioinfo" />
10
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.ADLTool" include-exts="adl" classpath-dirs="lib/adltool" />
11
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.VTTTool" include-exts="vtt" />
12
+ <tool class="edu.harvard.hul.ois.fits.tools.droid.Droid" exclude-exts="odm,m4a" classpath-dirs="lib/droid" />
13
+ <tool class="edu.harvard.hul.ois.fits.tools.jhove.Jhove" exclude-exts="dng,mbx,mbox,arw,adl,eml,java,doc,docx,docm,odt,rtf,pages,wpd,wp,epub,csv,avi,mov,mpg,mpeg,mkv,mp4,mpeg4,m2ts,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,pcd,zip" classpath-dirs="lib/jhove" />
14
+ <tool class="edu.harvard.hul.ois.fits.tools.fileutility.FileUtility" exclude-exts="dng,wps,adl,jar,epub,csv" classpath-dirs="lib/fileutility" />
15
+ <tool class="edu.harvard.hul.ois.fits.tools.exiftool.Exiftool" exclude-exts="txt,wps,vsd,jar,avi,mov,mpg,mpeg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv,m2ts,mpeg4" classpath-dirs="lib/exiftool" />
16
+ <tool class="edu.harvard.hul.ois.fits.tools.nlnz.MetadataExtractor" include-exts="bmp,gif,jpg,jpeg,wp,wpd,odt,doc,pdf,mp3,bfw,flac,html,xml,arc" classpath-dirs="lib/nzmetool,xml/nlnz"/>
17
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.FileInfo" classpath-dirs="lib/fileinfo" />
18
+ <tool class="edu.harvard.hul.ois.fits.tools.oisfileinfo.XmlMetadata" include-exts="xml" classpath-dirs="lib/xmlmetadata" />
19
+ <tool class="edu.harvard.hul.ois.fits.tools.ffident.FFIdent" exclude-exts="dng,wps,vsd,jar,ppt,rtf" classpath-dirs="lib/ffident" />
20
+
21
+ </tools>
22
+
23
+ <output>
24
+ <dataConsolidator class="edu.harvard.hul.ois.fits.consolidation.OISConsolidator"/>
25
+ <display-tool-output>false</display-tool-output>
26
+ <report-conflicts>true</report-conflicts>
27
+ <validate-tool-output>false</validate-tool-output>
28
+ <internal-output-schema>xml/fits_output.xsd</internal-output-schema>
29
+ <external-output-schema>http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd</external-output-schema>
30
+ <fits-xml-namespace>http://hul.harvard.edu/ois/xml/ns/fits/fits_output</fits-xml-namespace>
31
+ <enable-statistics>true</enable-statistics>
32
+ <enable-checksum>true</enable-checksum>
33
+ <!-- The below controls the exclusion of the checksum for certain files, even if enable-checksum is true -->
34
+ <!-- Video Exclusions -->
35
+ <!-- <checksum-exclusions exclude-exts="avi,mov,mpg,mkv,mp4,mxf,ogv,mj2,divx,dv,m4v,m2v,ismv"/> -->
36
+ <!-- Audio Exclusions -->
37
+ <!-- <checksum-exclusions exclude-exts="wav,aif,mp3,mp4,m4a,ra,rm"/> -->
38
+ </output>
39
+
40
+ <process>
41
+ <max-threads>20</max-threads>
42
+ </process>
43
+
44
+ <!-- file name of the droid signature file to use in tools/droid/-->
45
+ <droid_sigfile>DROID_SignatureFile_V90.xml</droid_sigfile>
46
+ <!-- Limits number of bytes DROID reads in (in KB) for processing large files for the listed file extensions. -->
47
+ <!-- Note: This should only be used with files that can provide sufficient metadata at beginning of the file -->
48
+ <!-- <droid_read_limit include-exts="mov,mxf" read-limit-kb="64" /> -->
49
+
50
+ <!-- the fits home is used by the MediaInfo tool to load the jna api libs -->
51
+ <!-- in most cases you won't need to change -->
52
+ <!-- example for BB will be /fits -->
53
+ <fits_home>.</fits_home>
54
+
55
+ </fits_configuration>
@@ -2,61 +2,60 @@
2
2
  <!DOCTYPE policymap [
3
3
  <!ELEMENT policymap (policy)+>
4
4
  <!ELEMENT policy (#PCDATA)>
5
- <!ATTLIST policy domain (delegate|coder|filter|path|resource) #IMPLIED>
5
+ <!ATTLIST policy domain (delegate|coder|filter|path|resource|cache) #IMPLIED>
6
6
  <!ATTLIST policy name CDATA #IMPLIED>
7
7
  <!ATTLIST policy rights CDATA #IMPLIED>
8
8
  <!ATTLIST policy pattern CDATA #IMPLIED>
9
9
  <!ATTLIST policy value CDATA #IMPLIED>
10
10
  ]>
11
- <policy domain="coder" rights="read|write" pattern="PDF" />
12
- <policy domain="coder" rights="read|write" pattern="LABEL" />
13
- <!--
14
- Configure ImageMagick policies.
11
+ <policymap>
12
+ <policy domain="coder" rights="read|write" pattern="PDF" />
13
+ <policy domain="coder" rights="read|write" pattern="LABEL" />
14
+ <!--
15
+ Configure ImageMagick policies.
15
16
 
16
- Domains include system, delegate, coder, filter, path, or resource.
17
+ Domains include system, delegate, coder, filter, path, resource, or cache.
17
18
 
18
- Rights include none, read, write, and execute. Use | to combine them,
19
- for example: "read | write" to permit read from, or write to, a path.
19
+ Rights include none, read, write, and execute. Use | to combine them,
20
+ for example: "read | write" to permit read from, or write to, a path.
20
21
 
21
- Use a glob expression as a pattern.
22
+ Use a glob expression as a pattern.
22
23
 
23
- Suppose we do not want users to process MPEG video images:
24
+ Suppose we do not want users to process MPEG video images:
24
25
 
25
- <policy domain="delegate" rights="none" pattern="mpeg:decode" />
26
+ <policy domain="delegate" rights="none" pattern="mpeg:decode" />
26
27
 
27
- Here we do not want users reading images from HTTP:
28
+ Here we do not want users reading images from HTTP:
28
29
 
29
- <policy domain="coder" rights="none" pattern="HTTP" />
30
+ <policy domain="coder" rights="none" pattern="HTTP" />
30
31
 
31
- Lets prevent users from executing any image filters:
32
+ Let's prevent users from executing any image filters:
32
33
 
33
- <policy domain="filter" rights="none" pattern="*" />
34
+ <policy domain="filter" rights="none" pattern="*" />
34
35
 
35
- The /repository file system is restricted to read only. We use a glob
36
- expression to match all paths that start with /repository:
37
-
38
- <policy domain="path" rights="read" pattern="/repository/*" />
36
+ The /repository file system is restricted to read only. We use a glob
37
+ expression to match all paths that start with /repository:
38
+
39
+ <policy domain="path" rights="read" pattern="/repository/*" />
39
40
 
40
- Any large image is cached to disk rather than memory:
41
+ Any large image is cached to disk rather than memory:
41
42
 
42
- <policy domain="resource" name="area" value="1GB"/>
43
+ <policy domain="resource" name="area" value="1GB"/>
44
+
45
+ Define arguments for the memory, map, area, and disk resources with
46
+ SI prefixes (e.g. 100MB). In addition, resource policies are maximums for
47
+ each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
48
+ exceeds policy maximum so memory limit is 1GB).
49
+ -->
50
+
51
+ <!-- NewspaperWorks: allow more than default Ubuntu policy, re: RAM, Disk -->
52
+ <policy domain="resource" name="memory" value="512MiB"/>
53
+ <policy domain="resource" name="map" value="1GiB"/>
54
+ <policy domain="resource" name="width" value="20KP"/>
55
+ <policy domain="resource" name="height" value="20KP"/>
56
+ <policy domain="resource" name="area" value="128MB"/>
57
+ <policy domain="resource" name="disk" value="2GiB"/>
43
58
 
44
- Define arguments for the memory, map, area, and disk resources with
45
- SI prefixes (.e.g 100MB). In addition, resource policies are maximums for
46
- each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
47
- exceeds policy maximum so memory limit is 1GB).
48
- -->
49
- <policymap>
50
- <!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
51
- <!-- <policy domain="resource" name="memory" value="2GiB"/> -->
52
- <!-- <policy domain="resource" name="map" value="4GiB"/> -->
53
- <!-- <policy domain="resource" name="area" value="1GB"/> -->
54
- <!-- <policy domain="resource" name="disk" value="16EB"/> -->
55
- <!-- <policy domain="resource" name="file" value="768"/> -->
56
- <!-- <policy domain="resource" name="thread" value="4"/> -->
57
- <!-- <policy domain="resource" name="throttle" value="0"/> -->
58
- <!-- <policy domain="resource" name="time" value="3600"/> -->
59
- <!-- <policy domain="system" name="precision" value="6"/> -->
60
59
  <policy domain="cache" name="shared-secret" value="passphrase"/>
61
60
  <policy domain="coder" rights="none" pattern="EPHEMERAL" />
62
61
  <policy domain="coder" rights="none" pattern="URL" />
@@ -68,9 +67,10 @@
68
67
  <policy domain="coder" rights="none" pattern="WIN" />
69
68
  <policy domain="coder" rights="none" pattern="PLT" />
70
69
  <policy domain="path" rights="none" pattern="@*" />
71
- <!-- disable ghostscript format types -->
70
+ <!-- disable ghostscript format types (except PDF) -->
72
71
  <policy domain="coder" rights="none" pattern="PS" />
73
72
  <policy domain="coder" rights="none" pattern="EPS" />
74
- <!-- <policy domain="coder" rights="none" pattern="PDF" /> -->
75
73
  <policy domain="coder" rights="none" pattern="XPS" />
74
+ <!-- NewspaperWorks: we need to allow PDF here -->
75
+ <!-- <policy domain="coder" rights="none" pattern="PDF" /> -->
76
76
  </policymap>
@@ -1,5 +1,7 @@
1
1
  require "newspaper_works/engine"
2
2
  require "newspaper_works/errors"
3
+ require "newspaper_works/jp2_image_metadata"
4
+ require "newspaper_works/image_tool"
3
5
  require "newspaper_works/ingest"
4
6
  require "newspaper_works/issue_pdf_composer"
5
7
  require "newspaper_works/text_extraction"
@@ -0,0 +1,119 @@
1
+ require 'open3'
2
+ require 'tmpdir'
3
+
4
module NewspaperWorks
  # Inspects and converts a single source image file by shelling out to
  # ImageMagick (`identify`, `convert`) and, for JP2 sources, to OpenJPEG
  # (`opj_decompress`).
  #
  # External commands are invoked with argument vectors (no shell), so paths
  # containing spaces or shell metacharacters are passed through verbatim.
  class ImageTool
    # Destination extensions for which monochrome output is honored.
    TIFF_EXTENSIONS = %w[.tif .tiff].freeze
    # ImageMagick options producing 1-bit, Group4-compressed output.
    MONOCHROME_OPTIONS = %w[-depth 1 -monochrome -compress Group4 -type bilevel].freeze
    # Number of leading bytes sniffed to identify the file type.
    MAGIC_LENGTH = 23

    attr_accessor :path, :ftype

    # @param path [String] path to the source image file; its first
    #   23 bytes are read immediately to sniff the file type.
    def initialize(path)
      @path = path
      @ftype = magic
      @metadata = nil
    end

    # @return [Hash] hash with following symbol keys, and respectively
    #   typed String and/or Integer values.
    #   :width, :height — both in Integer px units
    #   :color — (String enumerated from 'gray', 'monochrome', 'color')
    #   :num_components - Integer, number of channels
    #   :bits_per_component — Integer, bits per channel (e.g. 8 vs. 1)
    #   :content_type — RFC 2045 MIME type
    def metadata
      @metadata ||= jp2? ? jp2_metadata : identify_metadata
    end

    # Convert source image to image at destination path, inferring file type
    #   from destination file extension.  In case of JP2 files, create
    #   intermediate file using OpenJPEG 2000 that ImageMagick can use.
    #   Only outputs monochrome output if monochrome is true, destination
    #   format is TIFF.
    # @param destination [String] Path to output / destination file
    # @param monochrome [Boolean] true if monochrome output, otherwise false
    # @return [String] standard output captured from `convert`
    # @raise [RuntimeError] if the destination is a JP2 file
    def convert(destination, monochrome = false)
      raise 'JP2 output not yet supported' if destination.end_with?('jp2')
      return convert_image(@path, destination, monochrome) unless jp2?
      # The intermediate TIFF lives in a scratch directory that is removed
      # once the conversion finishes (or fails).
      Dir.mktmpdir do |workdir|
        convert_image(jp2_to_tiff(@path, workdir), destination, monochrome)
      end
    end

    private

      # Run an external command without a shell.
      #
      # @param argv [Array<#to_s>] program name followed by its arguments
      # @return [String] captured standard output of the command
      def run_command(*argv)
        output, _status = Open3.capture2(*argv.map(&:to_s))
        output
      end

      # @param source [String] path ImageMagick should read
      # @param destination [String] path ImageMagick should write
      # @param monochrome [Boolean] request bilevel output (TIFF only)
      # @return [String] standard output captured from `convert`
      def convert_image(source, destination, monochrome)
        tiff = TIFF_EXTENSIONS.include?(File.extname(destination).downcase)
        opts = monochrome && tiff ? MONOCHROME_OPTIONS : []
        run_command('convert', source, *opts, destination)
      end

      # Decode a JP2 into an intermediate TIFF via `opj_decompress`.
      #
      # @param source [String] path to JP2 source
      # @param workdir [String] directory to hold the intermediate file;
      #   when omitted a new temporary directory is created and the caller
      #   is responsible for removing it.
      # @return [String] path to the intermediate TIFF
      def jp2_to_tiff(source, workdir = Dir.mktmpdir)
        intermediate_path = File.join(workdir, 'intermediate.tif')
        run_command('opj_decompress', '-i', source, '-o', intermediate_path)
        intermediate_path
      end

      # @return [Hash] technical metadata for a JP2 source, read directly
      #   from the file by JP2ImageMetadata, plus :content_type.
      def jp2_metadata
        result = NewspaperWorks::JP2ImageMetadata.new(path).technical_metadata
        result[:content_type] = 'image/jp2'
        result
      end

      # Find the first "key: value" line of `identify` output for a key.
      # Matching is case-insensitive and requires the colon directly after
      # the key, so 'alpha' does not match a line such as "Alpha color: …".
      #
      # @param lines [Array<String>] lines of `identify -verbose` output
      # @param key [String] label to look for, in any letter case
      # @return [String, nil] value stripped of surrounding whitespace, or
      #   nil when no line carries that key
      def im_line_select(lines, key)
        prefix = "#{key.downcase}:"
        line = lines.find { |l| l.downcase.strip.start_with?(prefix) }
        return if line.nil?
        line.strip.split(':', 2).last.strip
      end

      # @return [Array(Integer, Integer)] width, height in Integer px units
      def im_identify_geometry(lines)
        # "640x480+0+0" -> "640x480" -> [640, 480]
        img_geo = im_line_select(lines, 'geometry').split('+')[0]
        img_geo.split('x').map(&:to_i)
      end

      # @return [Array<String>] lines of output from imagemagick `identify`
      def im_identify
        run_command('identify', '-verbose', path).lines
      end

      # @return [String, nil] MIME type reported by `identify`
      def im_mime(lines)
        return 'application/pdf' if pdf? # workaround older imagemagick bug
        im_line_select(lines, 'mime type')
      end

      # Fill in :num_components, :color and :bits_per_component on result.
      #
      # @param lines [Array<String>] lines of `identify -verbose` output
      # @param result [Hash] metadata hash, modified in place
      def populate_im_color!(lines, result)
        bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
        colorspace = im_line_select(lines, 'colorspace')
        color = colorspace == 'Gray' ? 'gray' : 'color'
        has_alpha = !im_line_select(lines, 'alpha').nil?
        result[:num_components] = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
        result[:color] = bpc == 1 ? 'monochrome' : color
        result[:bits_per_component] = bpc
      end

      # Return metadata by means of imagemagick identify
      #
      # @raise [IOError] if `identify` produced no output for the file
      def identify_metadata
        lines = im_identify
        raise IOError, "identify returned no metadata for #{path}" if lines.empty?
        result = {}
        result[:width], result[:height] = im_identify_geometry(lines)
        result[:content_type] = im_mime(lines)
        populate_im_color!(lines, result)
        result
      end

      # @return [String] up to the first 23 bytes of the file, read in
      #   binary mode; empty String for an empty file.
      def magic
        File.binread(@path, MAGIC_LENGTH).to_s
      end

      def jp2?
        @ftype.end_with?('ftypjp2')
      end

      def pdf?
        @ftype.start_with?('%PDF-')
      end
  end
end
@@ -0,0 +1,81 @@
1
module NewspaperWorks
  # Reads basic technical metadata (dimensions, component count, bit depth)
  # straight from the bytes of a JP2 file, without external tools.
  class JP2ImageMetadata
    TOKEN_MARKER_START = "\xFF".b.freeze
    TOKEN_MARKER_SIZ = "\x51".b.freeze
    TOKEN_IHDR = 'ihdr'.freeze
    # Leading bytes that cover the signature box and the 'ftypjp2' brand.
    MAGIC_LENGTH = 23
    # Bytes read after the SIZ marker: Lsiz(2) + Rsiz(2) + Xsiz(4) + Ysiz(4).
    SIZ_LENGTH = 12
    # ihdr payload bytes needed: height(4) + width(4) + NC(2) + BPC(1).
    IHDR_LENGTH = 11

    attr_accessor :path

    # @param path [String] path to jp2, for reading
    def initialize(path)
      @path = path
    end

    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] X size, Y size, in Integer-typed px
    # @raise [IOError] if the stream is not in binary mode, or if no SIZ
    #   marker (0xFF 0x51) is found before the end of the stream
    def extract_jp2_dim(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      # Informed by ISO/IEC 15444-1:2000, pp. 26-27
      # via:
      # http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
      #
      # first 23 bytes are file-magic, we can skip
      io.seek(MAGIC_LENGTH, IO::SEEK_SET)
      siz = nil
      byte = io.read(1)
      # Stop at EOF (read returns nil) instead of spinning forever when the
      # file holds no SIZ marker.
      until byte.nil?
        if byte == TOKEN_MARKER_START
          # on 0xFF inspect the subsequent byte; it is kept for the next
          # iteration because it may itself start a marker (0xFF 0xFF 0x51).
          byte = io.read(1)
          next unless byte == TOKEN_MARKER_SIZ
          siz = io.read(SIZ_LENGTH)
          break
        else
          byte = io.read(1)
        end
      end
      raise IOError, 'SIZ marker not found' if siz.nil? || siz.bytesize < SIZ_LENGTH
      # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
      x_siz = siz.byteslice(4, 4).unpack('N').first
      y_siz = siz.byteslice(8, 4).unpack('N').first
      [x_siz, y_siz]
    end

    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] number components, bits-per-component
    # @raise [IOError] if the stream is not in binary mode, or if a complete
    #   ihdr box is not present in the first 64 bytes
    def extract_jp2_components(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      io.seek(0, IO::SEEK_SET)
      # IHDR should be in first 64 bytes
      header = io.read(64)
      offset = header.nil? ? nil : header.rindex(TOKEN_IHDR)
      raise IOError, 'ihdr box not found' if offset.nil?
      ihdr_data = header.byteslice(offset + TOKEN_IHDR.bytesize, IHDR_LENGTH)
      raise IOError, 'ihdr box truncated' if ihdr_data.nil? || ihdr_data.bytesize < IHDR_LENGTH
      num_components = ihdr_data.byteslice(8, 2).unpack('n').first
      # stored as "bit depth of the components in the codestream, minus 1",
      # in the low 7 bits; the high bit only flags signed samples, so mask
      # it off before adding 1.
      bits_per_component = (ihdr_data.getbyte(10) & 0x7F) + 1
      [num_components, bits_per_component]
    end

    # Verify the stream starts with JP2 file magic; consumes 23 bytes.
    #
    # @param io [IO] IO stream positioned at the start of the file
    # @raise [IOError] if the leading bytes do not end with 'ftypjp2'
    def validate_jp2(io)
      magic = io.read(MAGIC_LENGTH)
      raise IOError, 'Not JP2 file' if magic.nil? || !magic.end_with?('ftypjp2')
    end

    # Read technical metadata from the JP2 file at #path.
    #
    # @return [Hash] hash with symbol keys :color ('monochrome', 'gray' or
    #   'color'), :num_components, :bits_per_component, :width, :height
    # @raise [IOError] if the file is not a readable JP2
    def technical_metadata
      # Block form guarantees the handle is closed even when parsing raises.
      File.open(path, 'rb') do |io|
        validate_jp2(io)
        x_siz, y_siz = extract_jp2_dim(io)
        nc, bpc = extract_jp2_components(io)
        color = nc >= 3 ? 'color' : 'gray'
        {
          color: bpc == 1 ? 'monochrome' : color,
          num_components: nc,
          bits_per_component: bpc,
          width: x_siz,
          height: y_siz
        }
      end
    end
  end
end
@@ -1,4 +1,5 @@
1
1
  require 'newspaper_works/text_extraction/alto_reader'
2
+ require 'newspaper_works/text_extraction/hocr_reader'
2
3
  require 'newspaper_works/text_extraction/page_ocr'
3
4
  require 'newspaper_works/text_extraction/render_alto'
4
5
  require 'newspaper_works/text_extraction/word_coords_builder'
@@ -0,0 +1,173 @@
1
+ require 'active_support/core_ext/module/delegation'
2
+ require 'json'
3
+ require 'nokogiri'
4
+
5
module NewspaperWorks
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from hOCR source
    #   - Coordinates in px units, unlike ALTO, which may have scaling concerns
    class HOCRReader
      attr_accessor :source, :doc_stream
      delegate :text, :width, :height, :words, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from hOCR
      class HOCRDocStream < Nokogiri::XML::SAX::Document
        attr_accessor :text, :words, :width, :height

        def initialize
          super()
          # plain text buffer:
          @text = ''
          # list of word hash, containing word+coord:
          @words = []
          # page width and height to be found in hOCR for `div.ocr_page`
          @width = nil
          @height = nil
          # to hold current word data state across #start_element, #characters,
          #   and #end_element methods (to associate word with coordinates).
          @current = nil
          # to preserve element classname from start to use by #end_element
          @element_class_name = nil
        end

        # Return coordinates from `span.ocrx_word` element attribute hash
        #
        # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
        # @return [Array] Array of position x, y, width, height in px; empty
        #   Array when the title attribute carries no complete bbox.
        def s_coords(attrs)
          x1, y1, x2, y2 = bbox_values(attrs['title'])
          return [] if y2.nil?
          [x1, y1, x2 - x1, y2 - y1]
        end

        # Consider element for processing?
        #   - `div.ocr_page` — to get page width/height
        #   - `span.ocr_line` — to help make plain text readable
        #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
        # @param name [String] Element name
        # @param class_name [String] HTML class name
        # @return [Boolean] true if element should be processed; otherwise false
        def consider?(name, class_name)
          selector = "#{name}.#{class_name}"
          ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].include?(selector)
        end

        # Begin a new word: reset per-word state and record its coordinates.
        #
        # @param attrs [Hash] attributes of the `span.ocrx_word` element
        def start_word(attrs)
          @current = {}
          # will be replaced during #characters method call:
          @current[:word] = nil
          @current[:coordinates] = s_coords(attrs)
        end

        # Record page width and height from the bbox of `div.ocr_page`.
        # Width and height are left unchanged when no complete bbox is found.
        #
        # @param attrs [Hash] attributes of the `div.ocr_page` element
        def start_page(attrs)
          bbox = bbox_values(attrs['title'])
          return if bbox.size < 4
          # width and height:
          @width = bbox[2]
          @height = bbox[3]
        end

        # @return [Boolean] true-ish when the current word has both non-empty
        #   text and a full set of four coordinates
        def word_complete?
          return false if @current.nil?
          coords = @current[:coordinates]
          @current[:word] && !@current[:word].empty? && coords.size == 4
        end

        def end_word
          # add trailing space to plaintext buffer for between words:
          @text += ' '
          @words.push(@current) if word_complete?
        end

        def end_line
          # strip trailing whitespace
          @text.strip!
          # then insert a line break
          @text += "\n"
        end

        # Callback for element start, ignores elements except for:
        #   - `div.ocr_page` — to get page width/height
        #   - `span.ocr_line` — to help make plain text readable
        #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          attributes = attrs.to_h
          @element_class_name = attributes['class']
          return unless consider?(name, @element_class_name)
          start_word(attributes) if @element_class_name == 'ocrx_word'
          start_page(attributes) if @element_class_name == 'ocr_page'
        end

        # Callback for text nodes; appends the text to both the current word
        # and the plain text buffer once a word has been started.
        #
        # @param value [String] character data reported by the parser
        def characters(value)
          return if @current.nil?
          return if @current[:coordinates].nil?
          @current[:word] ||= ''
          @current[:word] += value
          @text += value
        end

        # Callback for element end; at this time, flush word coordinate state
        #   for current word, and append line endings to plain text:
        #
        # NOTE(review): @element_class_name holds the class of the most
        #   recently *started* element (it is overwritten by every
        #   #start_element call), which is not necessarily the element being
        #   closed here — confirm this is intended for nested markup.
        #
        # @param name [String] element name.
        def end_element(_name)
          end_line if @element_class_name == 'ocr_line'
          end_word if @element_class_name == 'ocrx_word'
        end

        # Callback for completion of parsing hOCR, used to normalize generated
        #   text content (strip unneeded whitespace incidental to output).
        def end_document
          # postprocess @text to remove trailing spaces on lines
          @text = @text.split("\n").map(&:strip).join("\n")
          # remove excess line break
          @text.gsub!(/\n+/, "\n")
          # remove carriage returns in place (non-bang #delete would discard
          # its result and leave the buffer untouched)
          @text.delete!("\r")
          # remove trailing whitespace at end of buffer
          @text.strip!
        end

        private

          # Extract the integers of the `bbox` property from an hOCR title
          # attribute, wherever that property sits among the `;`-separated
          # properties.
          #
          # @param title [String, nil] value of an element's title attribute
          # @return [Array<Integer>] up to four values x1, y1, x2, y2; empty
          #   Array when the title has no bbox property
          def bbox_values(title)
            fields = title.to_s.split(';').map(&:strip)
            bbox = fields.find { |field| field.start_with?('bbox ') }
            return [] if bbox.nil?
            bbox.split(' ').drop(1).first(4).map(&:to_i)
          end
      end

      # Construct with either path or HTML [String]
      #
      # @param html [String], and process document
      def initialize(html)
        @source = isxml?(html) ? html : File.read(html)
        @doc_stream = HOCRDocStream.new
        parser = Nokogiri::HTML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml/html
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if value appears to be XML/HTML, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        words = @doc_stream.words
        builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(
          words,
          @doc_stream.width,
          @doc_stream.height
        )
        builder.to_json
      end
    end
  end
end