newspaper_works 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (461) hide show
  1. checksums.yaml +7 -0
  2. data/.fcrepo_wrapper +4 -0
  3. data/.gitignore +43 -0
  4. data/.rubocop.yml +143 -0
  5. data/.solr_wrapper +8 -0
  6. data/.travis.yml +50 -0
  7. data/Gemfile +47 -0
  8. data/LICENSE +203 -0
  9. data/README.md +159 -0
  10. data/Rakefile +38 -0
  11. data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
  12. data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
  13. data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
  14. data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
  15. data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
  16. data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
  17. data/app/assets/config/newspaper_works_manifest.js +2 -0
  18. data/app/assets/images/newspaper_works/.keep +0 -0
  19. data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
  20. data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
  21. data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
  22. data/app/assets/javascripts/newspaper_works.js +4 -0
  23. data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
  24. data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
  25. data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
  26. data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
  27. data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
  28. data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
  29. data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
  30. data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
  31. data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
  32. data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
  33. data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
  34. data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
  35. data/app/forms/hyrax/newspaper_article_form.rb +11 -0
  36. data/app/forms/hyrax/newspaper_container_form.rb +11 -0
  37. data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
  38. data/app/forms/hyrax/newspaper_page_form.rb +15 -0
  39. data/app/forms/hyrax/newspaper_title_form.rb +12 -0
  40. data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
  41. data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
  42. data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
  43. data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
  44. data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
  45. data/app/helpers/newspaper_works/application_helper.rb +5 -0
  46. data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
  47. data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
  48. data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
  49. data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
  50. data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
  51. data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
  52. data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
  53. data/app/indexers/newspaper_article_indexer.rb +16 -0
  54. data/app/indexers/newspaper_container_indexer.rb +18 -0
  55. data/app/indexers/newspaper_issue_indexer.rb +26 -0
  56. data/app/indexers/newspaper_page_indexer.rb +9 -0
  57. data/app/indexers/newspaper_title_indexer.rb +19 -0
  58. data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
  59. data/app/jobs/newspaper_works/application_job.rb +4 -0
  60. data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
  61. data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
  62. data/app/mailers/newspaper_works/application_mailer.rb +8 -0
  63. data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
  64. data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
  65. data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
  66. data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
  67. data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
  68. data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
  69. data/app/models/file_set.rb +10 -0
  70. data/app/models/newspaper_article.rb +158 -0
  71. data/app/models/newspaper_container.rb +86 -0
  72. data/app/models/newspaper_issue.rb +115 -0
  73. data/app/models/newspaper_page.rb +70 -0
  74. data/app/models/newspaper_title.rb +111 -0
  75. data/app/models/newspaper_works/application_record.rb +6 -0
  76. data/app/models/newspaper_works/derivative_attachment.rb +8 -0
  77. data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
  78. data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
  79. data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
  80. data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
  81. data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
  82. data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
  83. data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
  84. data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
  85. data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
  86. data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
  87. data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
  88. data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
  89. data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
  90. data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
  91. data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
  92. data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
  93. data/app/services/hyrax/article_genre_service.rb +9 -0
  94. data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
  95. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
  96. data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
  97. data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
  98. data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
  99. data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
  100. data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
  101. data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
  102. data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
  103. data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
  104. data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
  105. data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
  106. data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
  107. data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
  108. data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
  109. data/app/views/catalog/_snippets_more.html.erb +16 -0
  110. data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
  111. data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
  112. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  113. data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
  114. data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
  115. data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
  116. data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
  117. data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
  118. data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
  119. data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
  120. data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
  121. data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
  122. data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
  123. data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
  124. data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
  125. data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
  126. data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
  127. data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
  128. data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
  129. data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
  130. data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
  131. data/app/views/newspaper_works/base/_show.html.erb +45 -0
  132. data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
  133. data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
  134. data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
  135. data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
  136. data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
  137. data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
  138. data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
  139. data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
  140. data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
  141. data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
  142. data/app/views/records/edit_fields/_genre.html.erb +4 -0
  143. data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
  144. data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
  145. data/bin/rails +13 -0
  146. data/config/fcrepo_wrapper_test.yml +5 -0
  147. data/config/initializers/assets.rb +2 -0
  148. data/config/locales/newspaper_article.de.yml +12 -0
  149. data/config/locales/newspaper_article.en.yml +12 -0
  150. data/config/locales/newspaper_article.es.yml +12 -0
  151. data/config/locales/newspaper_article.fr.yml +12 -0
  152. data/config/locales/newspaper_article.it.yml +12 -0
  153. data/config/locales/newspaper_article.pt-BR.yml +12 -0
  154. data/config/locales/newspaper_article.zh.yml +12 -0
  155. data/config/locales/newspaper_container.de.yml +8 -0
  156. data/config/locales/newspaper_container.en.yml +8 -0
  157. data/config/locales/newspaper_container.es.yml +8 -0
  158. data/config/locales/newspaper_container.fr.yml +8 -0
  159. data/config/locales/newspaper_container.it.yml +8 -0
  160. data/config/locales/newspaper_container.pt-BR.yml +8 -0
  161. data/config/locales/newspaper_container.zh.yml +8 -0
  162. data/config/locales/newspaper_issue.de.yml +8 -0
  163. data/config/locales/newspaper_issue.en.yml +8 -0
  164. data/config/locales/newspaper_issue.es.yml +8 -0
  165. data/config/locales/newspaper_issue.fr.yml +8 -0
  166. data/config/locales/newspaper_issue.it.yml +8 -0
  167. data/config/locales/newspaper_issue.pt-BR.yml +8 -0
  168. data/config/locales/newspaper_issue.zh.yml +8 -0
  169. data/config/locales/newspaper_page.de.yml +15 -0
  170. data/config/locales/newspaper_page.en.yml +15 -0
  171. data/config/locales/newspaper_page.es.yml +15 -0
  172. data/config/locales/newspaper_page.fr.yml +15 -0
  173. data/config/locales/newspaper_page.it.yml +15 -0
  174. data/config/locales/newspaper_page.pt-BR.yml +15 -0
  175. data/config/locales/newspaper_page.zh.yml +15 -0
  176. data/config/locales/newspaper_title.de.yml +8 -0
  177. data/config/locales/newspaper_title.en.yml +8 -0
  178. data/config/locales/newspaper_title.es.yml +8 -0
  179. data/config/locales/newspaper_title.fr.yml +8 -0
  180. data/config/locales/newspaper_title.it.yml +8 -0
  181. data/config/locales/newspaper_title.pt-BR.yml +8 -0
  182. data/config/locales/newspaper_title.zh.yml +8 -0
  183. data/config/locales/newspaper_works.de.yml +50 -0
  184. data/config/locales/newspaper_works.en.yml +52 -0
  185. data/config/locales/newspaper_works.es.yml +52 -0
  186. data/config/locales/newspaper_works.fr.yml +52 -0
  187. data/config/locales/newspaper_works.it.yml +52 -0
  188. data/config/locales/newspaper_works.pt-BR.yml +52 -0
  189. data/config/locales/newspaper_works.zh.yml +52 -0
  190. data/config/routes.rb +9 -0
  191. data/config/solr_wrapper_test.yml +9 -0
  192. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  193. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  194. data/config/test-fixture/solr-config/elevate.xml +36 -0
  195. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  196. data/config/test-fixture/solr-config/protwords.txt +21 -0
  197. data/config/test-fixture/solr-config/schema.xml +366 -0
  198. data/config/test-fixture/solr-config/scripts.conf +24 -0
  199. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  200. data/config/test-fixture/solr-config/spellings.txt +2 -0
  201. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  202. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  203. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  204. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  205. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  206. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  207. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  208. data/config/vendor/imagemagick-6-policy.xml +76 -0
  209. data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
  210. data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
  211. data/lib/generators/newspaper_works/assets_generator.rb +29 -0
  212. data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
  213. data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
  214. data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
  215. data/lib/generators/newspaper_works/install_generator.rb +97 -0
  216. data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
  217. data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
  218. data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
  219. data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
  220. data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
  221. data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
  222. data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
  223. data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
  224. data/lib/newspaper_works/configuration.rb +14 -0
  225. data/lib/newspaper_works/data/fileset_helper.rb +25 -0
  226. data/lib/newspaper_works/data/path_helper.rb +40 -0
  227. data/lib/newspaper_works/data/work_derivatives.rb +314 -0
  228. data/lib/newspaper_works/data/work_file.rb +92 -0
  229. data/lib/newspaper_works/data/work_files.rb +181 -0
  230. data/lib/newspaper_works/data.rb +35 -0
  231. data/lib/newspaper_works/engine.rb +42 -0
  232. data/lib/newspaper_works/errors.rb +14 -0
  233. data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
  234. data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
  235. data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
  236. data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
  237. data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
  238. data/lib/newspaper_works/ingest/from_command.rb +52 -0
  239. data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
  240. data/lib/newspaper_works/ingest/issue_images.rb +51 -0
  241. data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
  242. data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
  243. data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
  244. data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
  245. data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
  246. data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
  247. data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
  248. data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
  249. data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
  250. data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
  251. data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
  252. data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
  253. data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
  254. data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
  255. data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
  256. data/lib/newspaper_works/ingest/ndnp.rb +21 -0
  257. data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
  258. data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
  259. data/lib/newspaper_works/ingest/page_image.rb +52 -0
  260. data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
  261. data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
  262. data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
  263. data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
  264. data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
  265. data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
  266. data/lib/newspaper_works/ingest/publication_info.rb +44 -0
  267. data/lib/newspaper_works/ingest.rb +90 -0
  268. data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
  269. data/lib/newspaper_works/logging.rb +54 -0
  270. data/lib/newspaper_works/page_finder.rb +62 -0
  271. data/lib/newspaper_works/resource_fetcher.rb +78 -0
  272. data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
  273. data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
  274. data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
  275. data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
  276. data/lib/newspaper_works/text_extraction.rb +10 -0
  277. data/lib/newspaper_works/version.rb +3 -0
  278. data/lib/newspaper_works.rb +19 -0
  279. data/lib/tasks/newspaper_works_tasks.rake +39 -0
  280. data/newspaper_works.gemspec +49 -0
  281. data/spec/.keep.txt +1 -0
  282. data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
  283. data/spec/controllers/catalog_controller_spec.rb +63 -0
  284. data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
  285. data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
  286. data/spec/factories/ability.rb +6 -0
  287. data/spec/factories/newspaper_issue.rb +7 -0
  288. data/spec/factories/newspaper_issue_ingest.rb +6 -0
  289. data/spec/factories/newspaper_page.rb +7 -0
  290. data/spec/factories/newspaper_page_ingest.rb +6 -0
  291. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  292. data/spec/factories/newspaper_title.rb +8 -0
  293. data/spec/factories/uploaded_pdf_file.rb +9 -0
  294. data/spec/factories/user.rb +13 -0
  295. data/spec/features/front_pages_for_title_spec.rb +19 -0
  296. data/spec/features/newspaper_title_search_spec.rb +30 -0
  297. data/spec/features/newspapers_search_spec.rb +49 -0
  298. data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
  299. data/spec/features_shared.rb +71 -0
  300. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  301. data/spec/fixtures/files/4.1.07.tiff +0 -0
  302. data/spec/fixtures/files/README.md +7 -0
  303. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  304. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  305. data/spec/fixtures/files/credits.md +16 -0
  306. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  307. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  308. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  309. data/spec/fixtures/files/minimal-alto.xml +31 -0
  310. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  311. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  312. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  313. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  314. data/spec/fixtures/files/ocr_alto.xml +202 -0
  315. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  316. data/spec/fixtures/files/ocr_color.tiff +0 -0
  317. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  318. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  319. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  320. data/spec/fixtures/files/page1.tiff +0 -0
  321. data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
  322. data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
  323. data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
  324. data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
  325. data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
  326. data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
  327. data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
  328. data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
  329. data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
  330. data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
  331. data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
  332. data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
  333. data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
  334. data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
  335. data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
  336. data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
  337. data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
  338. data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
  339. data/spec/fixtures/files/resource_mocks/urls.json +82 -0
  340. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  341. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  342. data/spec/fixtures/files/thumbnail.jpg +0 -0
  343. data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
  344. data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
  345. data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
  346. data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
  347. data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
  348. data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
  349. data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
  350. data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
  351. data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
  352. data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
  353. data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
  354. data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
  355. data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
  356. data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
  357. data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
  358. data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
  359. data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
  360. data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
  361. data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
  362. data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
  363. data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
  364. data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
  365. data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
  366. data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
  367. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
  368. data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
  369. data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
  370. data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
  371. data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
  372. data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
  373. data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
  374. data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
  375. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
  376. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
  377. data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
  378. data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
  379. data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
  380. data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
  381. data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
  382. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
  383. data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
  384. data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
  385. data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
  386. data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
  387. data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
  388. data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
  389. data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
  390. data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
  391. data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
  392. data/spec/lib/newspaper_works/logging_spec.rb +53 -0
  393. data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
  394. data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
  395. data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
  396. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
  397. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
  398. data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
  399. data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
  400. data/spec/misc_shared.rb +109 -0
  401. data/spec/model_shared.rb +134 -0
  402. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
  403. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
  404. data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
  405. data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
  406. data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
  407. data/spec/models/newspaper_article_spec.rb +73 -0
  408. data/spec/models/newspaper_container_spec.rb +111 -0
  409. data/spec/models/newspaper_issue_spec.rb +91 -0
  410. data/spec/models/newspaper_page_spec.rb +44 -0
  411. data/spec/models/newspaper_title_spec.rb +116 -0
  412. data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
  413. data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
  414. data/spec/models/solr_document_spec.rb +14 -0
  415. data/spec/ndnp_shared.rb +48 -0
  416. data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
  417. data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
  418. data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
  419. data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
  420. data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
  421. data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
  422. data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
  423. data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
  424. data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
  425. data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
  426. data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
  427. data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
  428. data/spec/routing/route_spec.rb +52 -0
  429. data/spec/search_builders/custom_search_builder_spec.rb +34 -0
  430. data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
  431. data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
  432. data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
  433. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
  434. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
  435. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
  436. data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
  437. data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
  438. data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
  439. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
  440. data/spec/spec_helper.rb +261 -0
  441. data/spec/support/controller_level_helpers.rb +28 -0
  442. data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
  443. data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
  444. data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
  445. data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
  446. data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
  447. data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
  448. data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
  449. data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
  450. data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
  451. data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
  452. data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
  453. data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
  454. data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
  455. data/tasks/newspaperworks_dev.rake +26 -0
  456. data/test/integration/navigation_test.rb +7 -0
  457. data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
  458. data/test/newspaper_works_test.rb +7 -0
  459. data/test/test_helper.rb +17 -0
  460. data/tmp/.keep +0 -0
  461. metadata +1037 -0
@@ -0,0 +1,54 @@
1
module NewspaperWorks
  # Mixin that routes log messages through Rails.logger, prefixing each
  # message with the name of the class that produced it, and that can
  # additionally broadcast Rails logging to a named file under log/.
  module Logging
    class << self
      # @return [Array] names already passed to configure_logger, used to
      #   avoid attaching the same named log twice
      attr_accessor :configured
    end
    self.configured = []

    # @return [Logger] the current Rails.logger (re-read on every call)
    def logger
      @logger = Rails.logger
    end

    # Log message, as in standard logger, but use message_format on message.
    # @param severity [Integer] log level/severity, e.g. Logger::INFO == 1
    # @param msg [String] Log message to be formatted by message_format
    # @param progname [String] (optional)
    def log(severity, msg, progname = nil, &block)
      logger.add(severity, message_format(msg), progname, &block)
    end

    # Simpler alternative to .log, with default severity, message_format
    #   wrapping.
    # @param msg [String] Log message to be formatted by message_format
    # @param severity [Integer] log level/severity, e.g. Logger::INFO == 1
    # @param progname [String]
    def write_log(msg, severity = Logger::INFO, progname = nil)
      # single code path for formatting and dispatch:
      log(severity, msg, progname)
    end

    # format message, distinct from per-output formatting, to be used in
    #   all logging channels Rails.logger broadcasts to.  This wrapping
    #   indicates in parenthetical prefix which class is acting to
    #   produce message.
    # @param msg [String]
    # @return [String] message prefixed with "(ClassName) "
    def message_format(msg)
      "(#{self.class}) #{msg}"
    end

    # Should be called by consuming class, prior to use of .logger method
    #   has checks to prevent duplicate configuration if already configured.
    # @param name [String] base name of log file, written to log/<name>.log
    def configure_logger(name)
      @logger = Rails.logger
      return if NewspaperWorks::Logging.configured.include?(name)
      path = Rails.root.join("log/#{name}.log")
      @named_log = ActiveSupport::Logger.new(path)
      @named_log.formatter = proc do |_severity, datetime, _progname, msg|
        "#{datetime}: #{msg}\n"
      end
      # rails will log to named_log in addition to any other configured
      # or default logging destinations:
      @logger.extend(ActiveSupport::Logger.broadcast(@named_log))
      NewspaperWorks::Logging.configured.push(name)
    end
  end
end
@@ -0,0 +1,62 @@
1
# useful methods for retrieving and ordering NewspaperPage objects
module NewspaperWorks
  module PageFinder
    ##
    # find all pages for an issue, return in order
    # NOTE(review): no `rows` value is passed to the search, so the result
    #   size is presumably limited by the index's default page size --
    #   confirm this is sufficient for issues with many pages.
    # @param issue_id [String]
    # @return [Array] ordered NewspaperPage SolrDocuments for an issue
    def pages_for_issue(issue_id)
      filters = [
        'has_model_ssim:"NewspaperPage"',
        "issue_id_ssi:\"#{issue_id}\""
      ]
      solr_resp = Blacklight.default_index.search(fq: filters.join(' AND '))
      all_pages = solr_resp.documents
      return [] if all_pages.blank?
      ordered_pages(all_pages)
    end

    ##
    # return an ordered array of NewspaperPage documents, by walking the
    # 'is_preceding_page_of_ssi' links starting at the page that follows
    # no other page.  Documents are returned unchanged when no such chain
    # can be started.  Pages that cannot be reached through the chain
    # (e.g. a linked page is missing from +documents+) are appended after
    # the chain, in their original order, rather than raising or dropping
    # them.
    # @param documents [Array] NewspaperPage SolrDocuments for an issue
    # @return [Array] ordered NewspaperPage SolrDocuments for an issue
    def ordered_pages(documents)
      return documents if documents.length <= 1
      first_page = documents.find do |doc|
        doc['is_following_page_of_ssi'].blank?
      end
      return documents if first_page.nil? ||
                          first_page['is_preceding_page_of_ssi'].blank?
      # id lookup table avoids re-scanning documents for every link:
      by_id = documents.each_with_object({}) do |doc, lookup|
        lookup[doc['id']] = doc
      end
      ordered = []
      seen = {}
      current = first_page
      # `seen` guards against cyclic links, which would otherwise never end
      while current && !seen[current['id']]
        seen[current['id']] = true
        ordered << current
        next_id = current['is_preceding_page_of_ssi']
        current = next_id.blank? ? nil : by_id[next_id]
      end
      ordered + documents.reject { |doc| seen[doc['id']] }
    end

    ##
    # return the index of the current page
    # @param page_id [String] id of the NewspaperPage
    # @param issue_id [String] id of the parent NewspaperIssue
    # @return [Integer] the page's index, 0 if it cannot be determined
    def get_page_index(page_id, issue_id = nil)
      default_index = 0
      unless issue_id
        page_doc = SolrDocument.find(page_id)
        return default_index unless page_doc &&
                                    page_doc['issue_id_ssi'] &&
                                    page_doc['is_following_page_of_ssi']
        issue_id = page_doc['issue_id_ssi']
      end
      all_pages = pages_for_issue(issue_id)
      return default_index if all_pages.blank?
      # fall back to default when the page is absent from the issue's pages
      all_pages.index { |page| page['id'] == page_id } || default_index
    end
  end
end
@@ -0,0 +1,78 @@
1
module NewspaperWorks
  # in-memory caching fetcher for HTTP GET requests, wraps Faraday.get
  class ResourceFetcher
    # only cache following HTTP response codes, per Section 6.1, RFC 7231
    CACHEABLE_STATUS = [
      200, 203, 204, 206, 300, 301, 404, 405, 410, 414, 501
    ].freeze

    class << self
      # @return [Hash, NilClass] cache shared by all instances, keyed by
      #   URL; nil until the first instance is constructed
      attr_accessor :cache
    end

    # Convenience GET through a throw-away instance.
    # @param url [String] URL to GET
    # @param stale_after [Integer] seconds after which a cached record
    #   is considered expired
    # @return [Hash] hash containing status, response headers, response body
    def self.get(url, stale_after = 3600)
      new(stale_after).get(url)
    end

    # @param url [String]
    # @return [true, false] whether the shared cache holds a record for
    #   url (expiry is not evaluated here)
    def self.include?(url)
      return false if cache.nil?
      cache.key?(url)
    end

    # @param stale_after [Integer] seconds after which a cached record
    #   is considered expired
    def initialize(stale_after = 3600)
      @stale_after = stale_after # seconds
      # initialize shared state only if missing:
      self.class.cache = {} if self.class.cache.nil?
    end

    # @param url [String] URL to GET
    # @return [Hash] cached record when fresh, otherwise a new response
    def get(url)
      cache_get(url) || miss_get(url)
    end

    # @return [Hash] shared cache state
    def cache
      self.class.cache
    end

    # @return [NilClass, Hash] hash of status, response body — or nil if no HIT
    def cache_get(url)
      # drops the record first if it has expired:
      check_expiry(url)
      return unless cache.include?(url)
      # return non-expired cache HIT:
      cache[url]
    end

    # Get URL from original source, by URL; will cache any cachable response
    #   in self.class.cache (shared state).
    # @param url [String] URL to GET
    # @raise [Faraday::ConnectionFailed] if DNS or TCP connection error.
    # @return [Hash] hash containing status, response headers, response body
    def miss_get(url)
      resp = Faraday.get url
      # create a new hash from headers
      result = resp.headers.to_h
      # add status and body to the result hash
      result['status'] = resp.status
      result['body'] = resp.body
      # set (new or replaced previously) cached value for URL:
      if CACHEABLE_STATUS.include?(resp.status)
        result['cached_time'] = Time.now.to_i
        cache[url] = result
      end
      result
    end

    # Remove the cached record for url if it has expired.
    # @param url [String]
    def check_expiry(url)
      return unless cache.include?(url)
      cache.delete(url) if expired(cache[url])
    end

    # @param record [Hash] cached record with 'cached_time' epoch seconds
    # @return [true, false] true if record is older than the stale threshold
    def expired(record)
      # does elapsed seconds between store and now exceed threshold?
      (Time.now.to_i - record['cached_time']) > @stale_after
    end
  end
end
@@ -0,0 +1,122 @@
1
+ require 'active_support/core_ext/module/delegation'
2
+ require 'json'
3
+ require 'nokogiri'
4
+
5
module NewspaperWorks
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from ALTO source
    class AltoReader
      attr_accessor :source, :doc_stream
      delegate :text, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from ALTO
      class AltoDocStream < Nokogiri::XML::SAX::Document
        attr_accessor :text, :words

        # @param image_width [Integer, NilClass] pixel width of the page
        #   image; when nil, no coordinate scaling is computed
        def initialize(image_width = nil)
          super()
          # scaling matters:
          @image_width = image_width
          @scaling = 1.0 # pt to px, if ALTO using points
          # plain text buffer:
          @text = ''
          # list of word hash, containing word+coord:
          @words = []
        end

        # Return coordinates from String element attribute hash
        #
        # @param attrs [Hash] hash containing ALTO `String` element attributes.
        # @return [Array] Array of position x, y, width, height in px.
        def s_coords(attrs)
          height = scale_value((attrs['HEIGHT'] || 0).to_i)
          width = scale_value((attrs['WIDTH'] || 0).to_i)
          hpos = scale_value((attrs['HPOS'] || 0).to_i)
          vpos = scale_value((attrs['VPOS'] || 0).to_i)
          [hpos, vpos, width, height]
        end

        # Derive the scaling factor from the `Page` element's WIDTH relative
        # to the known image width.  Scaling is left untouched when the image
        # width is unknown, the element has no WIDTH attribute, the WIDTH
        # is not positive, or both widths already agree.
        #
        # @param attrs [Array] Array of key, value pair Arrays.
        def compute_scaling(attrs)
          return if @image_width.nil?
          match = attrs.find { |pair| pair[0].casecmp?('WIDTH') }
          return if match.nil?
          page_width = match[1].to_i
          # a zero scaling factor would make scale_value divide by zero:
          return unless page_width.positive?
          return if @image_width == page_width
          @scaling = page_width / @image_width.to_f
        end

        # @param v [Numeric] coordinate value in ALTO page units
        # @return [Integer] value divided by the scaling factor, truncated
        def scale_value(v)
          (v / @scaling).to_i
        end

        # Callback for element start, implementation of which ignores
        #   non-String elements.
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          compute_scaling(attrs) if name == 'Page'
          return if name != 'String'
          values = attrs.to_h
          token = values['CONTENT']
          # a String element without CONTENT contributes no word:
          return if token.nil?
          @text << token
          @words << {
            word: token,
            coordinates: s_coords(values)
          }
        end

        # Callback for element end, used here to manage endings of lines and
        #   blocks.
        #
        # @param name [String] element name.
        def end_element(name)
          @text << " " if name == 'String'
          @text << "\n" if name == 'TextBlock'
          @text << "\n" if name == 'TextLine'
        end

        # Callback for completion of parsing ALTO, used to normalize generated
        #   text content (strip unneeded whitespace incidental to output).
        def end_document
          # postprocess @text to remove trailing spaces on lines
          @text = @text.split("\n").map(&:strip).join("\n")
          # remove trailing whitespace at end of buffer
          @text.strip!
        end
      end

      # Construct with either a path to an ALTO file or ALTO XML source,
      # and process the document.
      #
      # @param xml [String] path to ALTO XML file, or ALTO XML source
      # @param image_width [Integer, NilClass] pixel width of page image
      # @param image_height [Integer, NilClass] pixel height of page image
      def initialize(xml, image_width = nil, image_height = nil)
        @source = isxml?(xml) ? xml : File.read(xml)
        @image_width = image_width
        @image_height = image_height
        @doc_stream = AltoDocStream.new(image_width)
        parser = Nokogiri::XML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if string appears to be XML source, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        words = @doc_stream.words
        builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(words,
                                                                        @image_width,
                                                                        @image_height)
        builder.to_json
      end
    end
  end
end
@@ -0,0 +1,100 @@
1
+ require 'json'
2
+ require 'open3'
3
+ require 'rtesseract'
4
+
5
+ # --
6
module NewspaperWorks
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Runs Tesseract (via RTesseract) over a page image and exposes the
    # recognized words as plain text, word-coordinate JSON, or ALTO XML.
    class PageOCR
      # @param path [String] path to source image
      # @return [String] ALTO XML for the image at path
      def self.alto_from(path)
        new(path).alto
      end

      # @param path [String] path to source image
      def initialize(path)
        @path = path
        @words = nil
        @processor = "mini_magick"
        @source_meta = nil
        @source_geometry = nil
        # JPEG 2000 sources are handled through GraphicsMagick:
        @use_gm = extension.start_with?('jp2')
        @box = nil
        @plain = nil
      end

      # @return [String] lower-cased file extension without the leading dot,
      #   or empty string when the file name has no extension
      def extension
        File.extname(@path).sub(/\A\./, '').downcase
      end

      # Run OCR once, memoizing both the box result and its plain text.
      # @return [RTesseract::Box]
      def load_box
        return @box unless @box.nil?
        if @use_gm
          MiniMagick.with_cli(:graphicsmagick) { build_box }
        else
          build_box
        end
        @box
      end

      # @return [Array<Hash>] words as reported by the OCR box result
      def words
        @words = load_box.words if @words.nil?
        @words
      end

      # @param word [Hash] word hash with :word, :x_start, :y_start,
      #   :x_end, :y_end
      # @return [Hash] hash of :word and :coordinates as [x, y, width, height]
      def normalized_coordinate(word)
        {
          word: word[:word],
          coordinates: [
            word[:x_start],
            word[:y_start],
            (word[:x_end] - word[:x_start]),
            (word[:y_end] - word[:y_start])
          ]
        }
      end

      # @return [String] JSON serialization of flattened word coordinates
      def word_json
        save_words = words.map { |w| normalized_coordinate(w) }
        builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(save_words,
                                                                        width,
                                                                        height)
        builder.to_json
      end

      # @return [String] plain text of the OCR result
      def plain
        load_box
        @plain
      end

      # Determine source image geometry from `identify -verbose` output
      # (`gm identify` for JPEG 2000 sources); result is memoized.
      # @raise [RuntimeError] if output contains no Geometry line
      # @return [Array<Integer>] width, height
      def identify
        return @source_geometry unless @source_geometry.nil?
        cmd = ['identify', '-verbose', @path.to_s]
        cmd.unshift('gm') if @use_gm
        # argument-array form bypasses the shell, so paths containing spaces
        # or shell metacharacters are passed through verbatim:
        output, _status = Open3.capture2(*cmd)
        geo = output.lines.find { |line| line.strip.start_with?('Geometry') }
        raise "Unable to determine image geometry for #{@path}" if geo.nil?
        # e.g. "Geometry: 640x480+0+0" => "640x480"
        img_geo = geo.strip.split(':')[-1].strip.split('+')[0]
        @source_geometry = img_geo.split('x').map(&:to_i)
      end

      # @return [Integer] source image width
      def width
        identify[0]
      end

      # @return [Integer] source image height
      def height
        identify[1]
      end

      # @return [String] ALTO XML rendering of recognized words
      def alto
        writer = NewspaperWorks::TextExtraction::RenderAlto.new(width, height)
        writer.to_alto(words)
      end

      private

      # Perform OCR, storing box result and its plain-text rendering.
      def build_box
        @box = RTesseract::Box.new(@path, processor: @processor)
        @plain = @box.to_s
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'nokogiri'
2
+
3
module NewspaperWorks
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Serializes a flat list of word boxes as a single-page ALTO document
    # containing one TextBlock with one TextLine.
    class RenderAlto
      # @param width [Numeric] page width, in pixels
      # @param height [Numeric] page height, in pixels
      # @param scaling [Numeric] multiplier applied to word coordinates
      def initialize(width, height, scaling = 1.0)
        @width = width
        @height = height
        @scaling = scaling
      end

      # @param words [Array<Hash>] word hashes with :word, :x_start,
      #   :y_start, :x_end, :y_end
      # @return [String] ALTO XML
      def to_alto(words)
        document = alto_page(@width, @height) do |xml|
          words.each do |entry|
            box_height = entry[:y_end] - entry[:y_start]
            box_width = entry[:x_end] - entry[:x_start]
            xml.String(
              CONTENT: entry[:word],
              HEIGHT: scale_point(box_height).to_s,
              WIDTH: scale_point(box_width).to_s,
              HPOS: scale_point(entry[:x_start]).to_s,
              VPOS: scale_point(entry[:y_start]).to_s
            ) { xml.text '' }
          end
        end
        document.to_xml
      end

      private

        # wrap the word-generating block with page/block/line scaffolding
        # @return [Nokogiri::XML::Builder]
        def alto_page(pxwidth, pxheight, &block)
          Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
            xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
              xml.Description { xml.MeasurementUnit 'pixel' }
              alto_layout(xml, pxwidth, pxheight, &block)
            end
          end
        end

        # coordinates are emitted as integers (fraction truncated) after
        # applying the scaling multiplier
        def scale_point(value)
          scaled = value * @scaling
          scaled.to_i
        end

        # emit Layout > Page > PrintSpace, then descend to block/line
        def alto_layout(xml, pxwidth, pxheight, &block)
          page_attrs = {
            ID: 'ID1',
            PHYSICAL_IMG_NR: '1',
            HEIGHT: pxheight.to_i,
            WIDTH: pxwidth.to_i
          }
          xml.Layout do
            xml.Page(page_attrs) do
              xml.PrintSpace(full_extent(pxwidth, pxheight)) do
                alto_blockline(xml, pxwidth, pxheight, &block)
              end
            end
          end
        end

        # emit TextBlock > TextLine spanning the page, yielding the builder
        # so the caller can add word elements inside the line
        def alto_blockline(xml, pxwidth, pxheight)
          block_attrs = { ID: 'ID1a' }.merge(full_extent(pxwidth, pxheight))
          xml.TextBlock(block_attrs) do
            xml.TextLine(full_extent(pxwidth, pxheight)) do
              yield(xml)
            end
          end
        end

        # fresh attribute hash for an element covering the whole page
        def full_extent(pxwidth, pxheight)
          {
            HEIGHT: pxheight.to_i,
            WIDTH: pxwidth.to_i,
            HPOS: '0',
            VPOS: '0'
          }
        end
    end
  end
end
@@ -0,0 +1,30 @@
1
module NewspaperWorks
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Groups word coordinate boxes by word and serializes them, together
    # with page dimensions, as JSON.
    class WordCoordsBuilder
      # @param words [Array<Hash>] hashes with :word and :coordinates keys
      # @param width [Integer, NilClass] page width
      # @param height [Integer, NilClass] page height
      def initialize(words, width = nil, height = nil)
        @words = words
        @width = width
        @height = height
      end

      # Output JSON flattened word coordinates
      #
      # Accepts (and ignores) the generator arguments the JSON library
      # passes to #to_json, so an instance nested inside another structure
      # can be serialized without raising ArgumentError.
      #
      # @return [String] JSON serialization of flattened word coordinates
      def to_json(*_args)
        # each distinct word maps to the list of all its coordinate boxes:
        coordinates = @words.each_with_object({}) do |w, grouped|
          (grouped[w[:word]] ||= []) << w[:coordinates]
        end
        payload = { width: @width, height: @height, coords: coordinates }
        JSON.generate(payload)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require 'newspaper_works/text_extraction/alto_reader'
2
+ require 'newspaper_works/text_extraction/page_ocr'
3
+ require 'newspaper_works/text_extraction/render_alto'
4
+ require 'newspaper_works/text_extraction/word_coords_builder'
5
+
6
module NewspaperWorks
  # Namespace for text extraction components (OCR or otherwise); the
  # members are defined in the files required above.
  module TextExtraction; end
end
+ end
@@ -0,0 +1,3 @@
1
module NewspaperWorks
  # Gem version string.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,19 @@
1
+ require "newspaper_works/engine"
2
+ require "newspaper_works/errors"
3
+ require "newspaper_works/ingest"
4
+ require "newspaper_works/issue_pdf_composer"
5
+ require "newspaper_works/text_extraction"
6
+ require "newspaper_works/data"
7
+ require "newspaper_works/configuration"
8
+ require "newspaper_works/page_finder"
9
+ require "newspaper_works/logging"
10
+ require "newspaper_works/resource_fetcher"
11
+
12
# Newspaper works modules
module NewspaperWorks
  # Access the memoized configuration object, optionally yielding it to
  # a block for modification.
  # @yieldparam config [NewspaperWorks::Configuration]
  # @return [NewspaperWorks::Configuration]
  def self.config
    @config ||= NewspaperWorks::Configuration.new
    @config.tap { |settings| yield settings if block_given? }
  end
end
@@ -0,0 +1,39 @@
1
namespace :newspaper_works do
  # Load the host application's environment, defaulting RAILS_ENV to the
  # current Rails.env when it is not already set.
  def use_application
    ENV['RAILS_ENV'] ||= Rails.env
    Rails.application.require_environment!
  end

  desc 'Ingest an NDNP batch: "rake newspaper_works:ingest_ndnp -- --path="'
  task :ingest_ndnp do
    use_application
    usage = 'rake newspaper_works:ingest_ndnp --'
    ingester = NewspaperWorks::Ingest::NDNP::BatchIngester.from_command(ARGV, usage)
    puts "Beginning NDNP batch ingest..."
    ingester.ingest
    puts "NDNP batch ingest complete! See log/ingest.log for details."
  end

  desc 'Ingest a directory of PDF issues for a single publication: '\
       '"rake newspaper_works:ingest_pdf_issues -- --path="'
  task :ingest_issues do
    use_application
    usage = 'rake newspaper_works:ingest_issues --'
    ingester = NewspaperWorks::Ingest::BatchIssueIngester.from_command(ARGV, usage)
    puts "Beginning batch ingest of issues for single publication..."
    ingester.ingest
    puts "Ingest of issue(s) ingest complete, but may be pending background "\
         "jobs. See log/ingest.log for details."
  end

  # Aliases to media-specific task ingest names
  %i[ingest_pdf_issues ingest_tiff_issues ingest_jp2_issues].each do |alias_name|
    task alias_name => :ingest_issues
  end
end
@@ -0,0 +1,49 @@
1
+ $LOAD_PATH.push File.expand_path('../lib', __FILE__)
2
+
3
+ # version updated in one place:
4
+ require 'newspaper_works/version'
5
+
6
# Gem description:
Gem::Specification.new do |spec|
  spec.name        = 'newspaper_works'
  spec.version     = NewspaperWorks::VERSION
  spec.authors     = ['Sean Upton', 'Jacob Reed', 'Brian McBride',
                      'Eben English']
  spec.email       = ['sean.upton@utah.edu', 'jacob.reed@utah.edu',
                      'brian.mcbride@utah.edu', 'eenglish@bpl.org']
  spec.homepage    = 'https://github.com/marriott-library/newspaper_works'
  # adjacent literals are joined so that no line break or indentation
  # ends up inside the description:
  spec.description = 'Gem/Engine for Newspaper Works in Hyrax-based Samvera ' \
                     'Application.'
  spec.summary     = <<-SUMMARY
  newspaper_works is a Rails Engine gem providing model and administrative
  functions to Hyrax-based Samvera applications, for management of
  (primarily scanned) archival newspaper content.
  SUMMARY
  spec.license = 'Apache-2.0'
  # NUL-delimited listing: does not depend on $OUTPUT_RECORD_SEPARATOR
  # (nil unless the 'English' library is loaded) and keeps file names
  # that contain whitespace intact.
  spec.files = `git ls-files -z`.split("\x0")
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.add_dependency 'blacklight_iiif_search'
  spec.add_dependency 'blacklight_advanced_search', '6.4.1'
  spec.add_dependency 'hyrax', '2.5.1'
  spec.add_dependency 'nokogiri'
  spec.add_dependency 'rails', '~> 5.1'
  spec.add_dependency 'rtesseract', '~> 2.2.0'
  spec.add_dependency 'sass-rails', '~> 5.0'

  spec.add_development_dependency 'bixby'
  spec.add_development_dependency 'capybara', '~> 2.4', '< 2.18.0'
  spec.add_development_dependency 'chromedriver-helper', '~> 2.1'
  spec.add_development_dependency 'engine_cart', '~> 2.2'
  spec.add_development_dependency "factory_bot", '~> 4.4'
  # NOTE(review): Faraday is also called from library code in this gem;
  # presumably it is available at runtime through another dependency --
  # confirm, or promote this to a runtime dependency.
  spec.add_development_dependency "faraday"
  spec.add_development_dependency 'fcrepo_wrapper', '~> 0.5', '>= 0.5.1'
  spec.add_development_dependency 'newspaper_works_fixtures', '~> 0.3', '>=0.3.1'
  spec.add_development_dependency 'rails-controller-testing', '~> 1'
  spec.add_development_dependency 'rspec-rails', '~> 3.1'
  spec.add_development_dependency 'rspec-activemodel-mocks'
  spec.add_development_dependency 'selenium-webdriver'
  spec.add_development_dependency 'shoulda-matchers', '~> 3.1'
  spec.add_development_dependency 'solr_wrapper', '>= 1.1', '< 3.0'
  spec.add_development_dependency 'webdrivers', '~> 3.0'
  spec.add_development_dependency 'webmock', '~> 3.6'
end
data/spec/.keep.txt ADDED
@@ -0,0 +1 @@
1
+ spec dir for RSpec