newspaper_works 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (461) hide show
  1. checksums.yaml +7 -0
  2. data/.fcrepo_wrapper +4 -0
  3. data/.gitignore +43 -0
  4. data/.rubocop.yml +143 -0
  5. data/.solr_wrapper +8 -0
  6. data/.travis.yml +50 -0
  7. data/Gemfile +47 -0
  8. data/LICENSE +203 -0
  9. data/README.md +159 -0
  10. data/Rakefile +38 -0
  11. data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
  12. data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
  13. data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
  14. data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
  15. data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
  16. data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
  17. data/app/assets/config/newspaper_works_manifest.js +2 -0
  18. data/app/assets/images/newspaper_works/.keep +0 -0
  19. data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
  20. data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
  21. data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
  22. data/app/assets/javascripts/newspaper_works.js +4 -0
  23. data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
  24. data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
  25. data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
  26. data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
  27. data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
  28. data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
  29. data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
  30. data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
  31. data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
  32. data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
  33. data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
  34. data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
  35. data/app/forms/hyrax/newspaper_article_form.rb +11 -0
  36. data/app/forms/hyrax/newspaper_container_form.rb +11 -0
  37. data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
  38. data/app/forms/hyrax/newspaper_page_form.rb +15 -0
  39. data/app/forms/hyrax/newspaper_title_form.rb +12 -0
  40. data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
  41. data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
  42. data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
  43. data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
  44. data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
  45. data/app/helpers/newspaper_works/application_helper.rb +5 -0
  46. data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
  47. data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
  48. data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
  49. data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
  50. data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
  51. data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
  52. data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
  53. data/app/indexers/newspaper_article_indexer.rb +16 -0
  54. data/app/indexers/newspaper_container_indexer.rb +18 -0
  55. data/app/indexers/newspaper_issue_indexer.rb +26 -0
  56. data/app/indexers/newspaper_page_indexer.rb +9 -0
  57. data/app/indexers/newspaper_title_indexer.rb +19 -0
  58. data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
  59. data/app/jobs/newspaper_works/application_job.rb +4 -0
  60. data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
  61. data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
  62. data/app/mailers/newspaper_works/application_mailer.rb +8 -0
  63. data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
  64. data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
  65. data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
  66. data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
  67. data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
  68. data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
  69. data/app/models/file_set.rb +10 -0
  70. data/app/models/newspaper_article.rb +158 -0
  71. data/app/models/newspaper_container.rb +86 -0
  72. data/app/models/newspaper_issue.rb +115 -0
  73. data/app/models/newspaper_page.rb +70 -0
  74. data/app/models/newspaper_title.rb +111 -0
  75. data/app/models/newspaper_works/application_record.rb +6 -0
  76. data/app/models/newspaper_works/derivative_attachment.rb +8 -0
  77. data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
  78. data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
  79. data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
  80. data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
  81. data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
  82. data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
  83. data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
  84. data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
  85. data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
  86. data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
  87. data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
  88. data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
  89. data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
  90. data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
  91. data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
  92. data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
  93. data/app/services/hyrax/article_genre_service.rb +9 -0
  94. data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
  95. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
  96. data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
  97. data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
  98. data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
  99. data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
  100. data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
  101. data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
  102. data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
  103. data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
  104. data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
  105. data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
  106. data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
  107. data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
  108. data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
  109. data/app/views/catalog/_snippets_more.html.erb +16 -0
  110. data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
  111. data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
  112. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  113. data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
  114. data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
  115. data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
  116. data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
  117. data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
  118. data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
  119. data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
  120. data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
  121. data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
  122. data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
  123. data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
  124. data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
  125. data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
  126. data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
  127. data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
  128. data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
  129. data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
  130. data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
  131. data/app/views/newspaper_works/base/_show.html.erb +45 -0
  132. data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
  133. data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
  134. data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
  135. data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
  136. data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
  137. data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
  138. data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
  139. data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
  140. data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
  141. data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
  142. data/app/views/records/edit_fields/_genre.html.erb +4 -0
  143. data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
  144. data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
  145. data/bin/rails +13 -0
  146. data/config/fcrepo_wrapper_test.yml +5 -0
  147. data/config/initializers/assets.rb +2 -0
  148. data/config/locales/newspaper_article.de.yml +12 -0
  149. data/config/locales/newspaper_article.en.yml +12 -0
  150. data/config/locales/newspaper_article.es.yml +12 -0
  151. data/config/locales/newspaper_article.fr.yml +12 -0
  152. data/config/locales/newspaper_article.it.yml +12 -0
  153. data/config/locales/newspaper_article.pt-BR.yml +12 -0
  154. data/config/locales/newspaper_article.zh.yml +12 -0
  155. data/config/locales/newspaper_container.de.yml +8 -0
  156. data/config/locales/newspaper_container.en.yml +8 -0
  157. data/config/locales/newspaper_container.es.yml +8 -0
  158. data/config/locales/newspaper_container.fr.yml +8 -0
  159. data/config/locales/newspaper_container.it.yml +8 -0
  160. data/config/locales/newspaper_container.pt-BR.yml +8 -0
  161. data/config/locales/newspaper_container.zh.yml +8 -0
  162. data/config/locales/newspaper_issue.de.yml +8 -0
  163. data/config/locales/newspaper_issue.en.yml +8 -0
  164. data/config/locales/newspaper_issue.es.yml +8 -0
  165. data/config/locales/newspaper_issue.fr.yml +8 -0
  166. data/config/locales/newspaper_issue.it.yml +8 -0
  167. data/config/locales/newspaper_issue.pt-BR.yml +8 -0
  168. data/config/locales/newspaper_issue.zh.yml +8 -0
  169. data/config/locales/newspaper_page.de.yml +15 -0
  170. data/config/locales/newspaper_page.en.yml +15 -0
  171. data/config/locales/newspaper_page.es.yml +15 -0
  172. data/config/locales/newspaper_page.fr.yml +15 -0
  173. data/config/locales/newspaper_page.it.yml +15 -0
  174. data/config/locales/newspaper_page.pt-BR.yml +15 -0
  175. data/config/locales/newspaper_page.zh.yml +15 -0
  176. data/config/locales/newspaper_title.de.yml +8 -0
  177. data/config/locales/newspaper_title.en.yml +8 -0
  178. data/config/locales/newspaper_title.es.yml +8 -0
  179. data/config/locales/newspaper_title.fr.yml +8 -0
  180. data/config/locales/newspaper_title.it.yml +8 -0
  181. data/config/locales/newspaper_title.pt-BR.yml +8 -0
  182. data/config/locales/newspaper_title.zh.yml +8 -0
  183. data/config/locales/newspaper_works.de.yml +50 -0
  184. data/config/locales/newspaper_works.en.yml +52 -0
  185. data/config/locales/newspaper_works.es.yml +52 -0
  186. data/config/locales/newspaper_works.fr.yml +52 -0
  187. data/config/locales/newspaper_works.it.yml +52 -0
  188. data/config/locales/newspaper_works.pt-BR.yml +52 -0
  189. data/config/locales/newspaper_works.zh.yml +52 -0
  190. data/config/routes.rb +9 -0
  191. data/config/solr_wrapper_test.yml +9 -0
  192. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  193. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  194. data/config/test-fixture/solr-config/elevate.xml +36 -0
  195. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  196. data/config/test-fixture/solr-config/protwords.txt +21 -0
  197. data/config/test-fixture/solr-config/schema.xml +366 -0
  198. data/config/test-fixture/solr-config/scripts.conf +24 -0
  199. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  200. data/config/test-fixture/solr-config/spellings.txt +2 -0
  201. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  202. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  203. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  204. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  205. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  206. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  207. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  208. data/config/vendor/imagemagick-6-policy.xml +76 -0
  209. data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
  210. data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
  211. data/lib/generators/newspaper_works/assets_generator.rb +29 -0
  212. data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
  213. data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
  214. data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
  215. data/lib/generators/newspaper_works/install_generator.rb +97 -0
  216. data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
  217. data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
  218. data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
  219. data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
  220. data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
  221. data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
  222. data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
  223. data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
  224. data/lib/newspaper_works/configuration.rb +14 -0
  225. data/lib/newspaper_works/data/fileset_helper.rb +25 -0
  226. data/lib/newspaper_works/data/path_helper.rb +40 -0
  227. data/lib/newspaper_works/data/work_derivatives.rb +314 -0
  228. data/lib/newspaper_works/data/work_file.rb +92 -0
  229. data/lib/newspaper_works/data/work_files.rb +181 -0
  230. data/lib/newspaper_works/data.rb +35 -0
  231. data/lib/newspaper_works/engine.rb +42 -0
  232. data/lib/newspaper_works/errors.rb +14 -0
  233. data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
  234. data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
  235. data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
  236. data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
  237. data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
  238. data/lib/newspaper_works/ingest/from_command.rb +52 -0
  239. data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
  240. data/lib/newspaper_works/ingest/issue_images.rb +51 -0
  241. data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
  242. data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
  243. data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
  244. data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
  245. data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
  246. data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
  247. data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
  248. data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
  249. data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
  250. data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
  251. data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
  252. data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
  253. data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
  254. data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
  255. data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
  256. data/lib/newspaper_works/ingest/ndnp.rb +21 -0
  257. data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
  258. data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
  259. data/lib/newspaper_works/ingest/page_image.rb +52 -0
  260. data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
  261. data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
  262. data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
  263. data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
  264. data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
  265. data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
  266. data/lib/newspaper_works/ingest/publication_info.rb +44 -0
  267. data/lib/newspaper_works/ingest.rb +90 -0
  268. data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
  269. data/lib/newspaper_works/logging.rb +54 -0
  270. data/lib/newspaper_works/page_finder.rb +62 -0
  271. data/lib/newspaper_works/resource_fetcher.rb +78 -0
  272. data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
  273. data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
  274. data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
  275. data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
  276. data/lib/newspaper_works/text_extraction.rb +10 -0
  277. data/lib/newspaper_works/version.rb +3 -0
  278. data/lib/newspaper_works.rb +19 -0
  279. data/lib/tasks/newspaper_works_tasks.rake +39 -0
  280. data/newspaper_works.gemspec +49 -0
  281. data/spec/.keep.txt +1 -0
  282. data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
  283. data/spec/controllers/catalog_controller_spec.rb +63 -0
  284. data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
  285. data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
  286. data/spec/factories/ability.rb +6 -0
  287. data/spec/factories/newspaper_issue.rb +7 -0
  288. data/spec/factories/newspaper_issue_ingest.rb +6 -0
  289. data/spec/factories/newspaper_page.rb +7 -0
  290. data/spec/factories/newspaper_page_ingest.rb +6 -0
  291. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  292. data/spec/factories/newspaper_title.rb +8 -0
  293. data/spec/factories/uploaded_pdf_file.rb +9 -0
  294. data/spec/factories/user.rb +13 -0
  295. data/spec/features/front_pages_for_title_spec.rb +19 -0
  296. data/spec/features/newspaper_title_search_spec.rb +30 -0
  297. data/spec/features/newspapers_search_spec.rb +49 -0
  298. data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
  299. data/spec/features_shared.rb +71 -0
  300. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  301. data/spec/fixtures/files/4.1.07.tiff +0 -0
  302. data/spec/fixtures/files/README.md +7 -0
  303. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  304. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  305. data/spec/fixtures/files/credits.md +16 -0
  306. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  307. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  308. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  309. data/spec/fixtures/files/minimal-alto.xml +31 -0
  310. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  311. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  312. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  313. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  314. data/spec/fixtures/files/ocr_alto.xml +202 -0
  315. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  316. data/spec/fixtures/files/ocr_color.tiff +0 -0
  317. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  318. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  319. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  320. data/spec/fixtures/files/page1.tiff +0 -0
  321. data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
  322. data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
  323. data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
  324. data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
  325. data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
  326. data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
  327. data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
  328. data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
  329. data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
  330. data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
  331. data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
  332. data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
  333. data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
  334. data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
  335. data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
  336. data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
  337. data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
  338. data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
  339. data/spec/fixtures/files/resource_mocks/urls.json +82 -0
  340. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  341. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  342. data/spec/fixtures/files/thumbnail.jpg +0 -0
  343. data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
  344. data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
  345. data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
  346. data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
  347. data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
  348. data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
  349. data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
  350. data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
  351. data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
  352. data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
  353. data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
  354. data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
  355. data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
  356. data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
  357. data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
  358. data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
  359. data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
  360. data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
  361. data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
  362. data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
  363. data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
  364. data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
  365. data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
  366. data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
  367. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
  368. data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
  369. data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
  370. data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
  371. data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
  372. data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
  373. data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
  374. data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
  375. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
  376. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
  377. data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
  378. data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
  379. data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
  380. data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
  381. data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
  382. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
  383. data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
  384. data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
  385. data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
  386. data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
  387. data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
  388. data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
  389. data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
  390. data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
  391. data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
  392. data/spec/lib/newspaper_works/logging_spec.rb +53 -0
  393. data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
  394. data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
  395. data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
  396. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
  397. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
  398. data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
  399. data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
  400. data/spec/misc_shared.rb +109 -0
  401. data/spec/model_shared.rb +134 -0
  402. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
  403. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
  404. data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
  405. data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
  406. data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
  407. data/spec/models/newspaper_article_spec.rb +73 -0
  408. data/spec/models/newspaper_container_spec.rb +111 -0
  409. data/spec/models/newspaper_issue_spec.rb +91 -0
  410. data/spec/models/newspaper_page_spec.rb +44 -0
  411. data/spec/models/newspaper_title_spec.rb +116 -0
  412. data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
  413. data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
  414. data/spec/models/solr_document_spec.rb +14 -0
  415. data/spec/ndnp_shared.rb +48 -0
  416. data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
  417. data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
  418. data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
  419. data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
  420. data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
  421. data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
  422. data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
  423. data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
  424. data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
  425. data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
  426. data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
  427. data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
  428. data/spec/routing/route_spec.rb +52 -0
  429. data/spec/search_builders/custom_search_builder_spec.rb +34 -0
  430. data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
  431. data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
  432. data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
  433. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
  434. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
  435. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
  436. data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
  437. data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
  438. data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
  439. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
  440. data/spec/spec_helper.rb +261 -0
  441. data/spec/support/controller_level_helpers.rb +28 -0
  442. data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
  443. data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
  444. data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
  445. data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
  446. data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
  447. data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
  448. data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
  449. data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
  450. data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
  451. data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
  452. data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
  453. data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
  454. data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
  455. data/tasks/newspaperworks_dev.rake +26 -0
  456. data/test/integration/navigation_test.rb +7 -0
  457. data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
  458. data/test/newspaper_works_test.rb +7 -0
  459. data/test/test_helper.rb +17 -0
  460. data/tmp/.keep +0 -0
  461. metadata +1037 -0
@@ -0,0 +1,101 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Creates a NewspaperIssue work from parsed NDNP issue data, ingests
      # each page the issue enumerates, and enqueues composition of an
      # issue-level PDF once the pages have been handled.
      class IssueIngester
        include NewspaperWorks::Logging
        include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper
        include NewspaperWorks::Ingest::PubFinder

        attr_accessor :issue, :target, :opts

        delegate :path, to: :issue

        # Fields read from the source issue metadata and written, under the
        # same name, to the target NewspaperIssue.
        COPY_FIELDS = %i[
          lccn
          edition_number
          edition_name
          volume
          publication_date
          held_by
          issue_number
        ].freeze

        # @param issue [NewspaperWorks::Ingest::NDNP::IssueIngest]
        #   source issue data
        # @param opts [Hash]
        #   ingest options, e.g. administrative metadata
        def initialize(issue, opts = {})
          @issue = issue
          @opts = opts
          # the NewspaperIssue work; assigned when the issue is created
          @target = nil
          configure_logger('ingest')
        end

        # Run the whole issue ingest: build the issue work, ingest its
        # pages, then queue the job that composes the issue PDF.
        def ingest
          construct_issue
          ingest_pages
          NewspaperWorks::ComposeIssuePDFJob.perform_later(@target)
        end

        # Create the issue work, then link it to a publication.
        def construct_issue
          create_issue
          find_or_create_linked_publication
        end

        # Ingest every page yielded by the source issue.
        def ingest_pages
          issue.each { |page_data| page_ingester(page_data).ingest }
        end

        private

        # @param page_data [Object] one page as yielded by the source issue
        # @return [NewspaperWorks::Ingest::NDNP::PageIngester]
        #   ingester bound to the current target issue and options
        def page_ingester(page_data)
          NewspaperWorks::Ingest::NDNP::PageIngester.new(page_data, @target, @opts)
        end

        # @return [String] issue date formatted like "January 2, 1910",
        #   parsed from the ISO 8601 date in the source metadata
        def publication_date
          DateTime.iso8601(issue.metadata.publication_date).strftime('%B %-d, %Y')
        end

        # @param issue [#metadata] source issue data
        # @return [String, nil] publication title with any trailing
        #   " (...)" parenthetical removed
        def publication_title(issue)
          full_title = issue.metadata.publication_title.strip
          full_title.split(/ \(/).first
        end

        # @return [String] "<publication title>: <formatted date>"
        def issue_title
          [publication_title(issue), publication_date].join(': ')
        end

        # Set the (plural) title on the target, then copy each COPY_FIELDS
        # value from the source metadata to the same-named target attribute.
        def copy_issue_metadata
          source = issue.metadata
          @target.title = [issue_title]
          COPY_FIELDS.each do |field|
            setter = "#{field}="
            @target.send(setter, source.send(field.to_s))
          end
        end

        # Create the NewspaperIssue, populate descriptive and administrative
        # metadata, save it, and log the new work id.
        def create_issue
          @target = NewspaperIssue.create
          copy_issue_metadata
          assign_administrative_metadata
          @target.save!
          write_log("Saved metadata to new NewspaperIssue #{@target.id}")
        end

        # Link the target issue to a publication identified by LCCN and
        # title; the lookup itself is not defined in this class (presumably
        # provided by the included PubFinder mixin).
        def find_or_create_linked_publication
          pub_title = publication_title(issue)
          pub_lccn = issue.metadata.lccn
          find_or_create_publication_for_issue(@target, pub_lccn, pub_title, @opts)
        end
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Accessors for issue-level descriptive metadata read via XPath from
      # an NDNP issue XML document (METS with embedded MODS).
      class IssueMetadata
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :doc, :parent

        # @param path [String] path to the issue XML file
        # @param parent [Object, nil] optional owner; when given, its `doc`
        #   is reused instead of re-parsing the file, and its `container`
        #   is consulted for the publication title
        def initialize(path, parent = nil)
          @path = path
          @parent = parent
          @doc = nil
          load_doc
        end

        # @return [String] short representation naming class and path.
        #   Values are passed as format arguments (not interpolated into
        #   the template) so a '%' in the path cannot break formatting.
        def inspect
          format(
            "<%<klass>s:0x000000000%<oid>x\n" \
            "\tpath: '%<path>s' ...>",
            klass: self.class,
            oid: object_id << 1,
            path: path
          )
        end

        # LCCN (mandatory)
        # @return [String]
        def lccn
          xpath("//mods:identifier[@type='lccn']").text
        end

        # Volume number (optional)
        # @return [String,NilClass]
        def volume
          optional_text("//mods:detail[@type='volume']/mods:number")
        end

        # Issue number (optional)
        # @return [String,NilClass]
        def issue_number
          optional_text("//mods:detail[@type='issue']/mods:number")
        end

        # Edition name (optional "caption" of the edition detail), which
        #   may be used as a human-readable label.
        # @return [String,NilClass]
        def edition_name
          optional_text("//mods:detail[@type='edition']/mods:caption")
        end

        # Edition number (mandatory)
        # @return [String]
        def edition_number
          xpath("//mods:detail[@type='edition']/mods:number").text
        end

        # Issue date (mandatory field) as ISO 8601 datestamp string
        # @return [String] (ISO-8601 date) publication date
        def publication_date
          xpath("//mods:originInfo/mods:dateIssued").text
        end

        # Publication title, taken from the reel (container) metadata when
        #   a parent provides one, otherwise parsed from //mets/@LABEL.
        #   Label convention:
        #     "ACME Times (Springfield, UT), 1911-01-25, First Edition"
        #   The portion before the first ", <digit>" is returned, i.e. the
        #   name and (*for now TBD*) place of publication in parentheses.
        # @return [String,NilClass] title, or nil when no label is usable
        def publication_title
          # try from reel first
          reel = parent.nil? ? nil : parent.container
          return reel.metadata.title unless reel.nil?
          # fallback to parsing //mets/@LABEL
          label = xpath('//mets:mets/@LABEL').first
          return if label.nil?
          # an empty LABEL splits to [], whose first element is nil
          label.value.split(/, [0-9]/).first
        end

        # Original Source Repository (NDNP-mandatory)
        # @return [String,NilClass] displayLabel of the first
        #   physicalLocation element, nil when the element is absent
        def held_by
          location = xpath("//mods:physicalLocation").first
          location['displayLabel'] unless location.nil?
        end

        private

        # @param expr [String] XPath expression
        # @return [String,NilClass] text of matching nodes, nil if none match
        def optional_text(expr)
          nodes = xpath(expr)
          nodes.text unless nodes.empty?
        end

        # Reuse the parent's parsed document when available; otherwise
        # parse the file at `path`, closing the file handle afterwards.
        def load_doc
          @doc = @parent.doc unless @parent.nil?
          return unless @doc.nil?
          @doc = File.open(path) { |file| Nokogiri::XML(file) }
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Mixin for NDNP ingesters that need to apply administrative
      # metadata to the work asset they are building.
      module NDNPAssetHelper
        # Apply administrative metadata, taken from the ingester's options,
        # to a work by delegating to
        # NewspaperWorks::Ingest.assign_administrative_metadata.
        #
        # Pre-conditions for use:
        #   consuming class sets @target to the work asset
        #   consuming class sets @opts to a Hash of options
        #
        # @param work [Object, nil] work to update; falls back to @target
        def assign_administrative_metadata(work = nil)
          asset = work || @target
          NewspaperWorks::Ingest.assign_administrative_metadata(asset, @opts)
        end
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ require 'nokogiri'
2
+
3
module NewspaperWorks
  module Ingest
    module NDNP
      # Mixin of XPath and path helpers for NDNP METS documents.
      # Consuming classes are expected to respond to `doc` and `path`, and
      # to `dmdid` for the page-specific helpers.
      module NDNPMetsHelper
        # Namespace prefix map passed to every XPath query; each namespace
        # is registered under both lower- and upper-case prefixes.
        XML_NS = {
          mets: 'http://www.loc.gov/METS/',
          METS: 'http://www.loc.gov/METS/',
          mods: 'http://www.loc.gov/mods/v3',
          MODS: 'http://www.loc.gov/mods/v3',
          ndnp: 'http://www.loc.gov/ndnp',
          NDNP: 'http://www.loc.gov/ndnp'
        }.freeze

        # Run an XPath query with the shared namespace map.
        # @param expr [String] XPath expression
        # @param context [#xpath, nil] node to query; defaults to `doc`
        def xpath(expr, context = nil)
          node = context || doc
          node.xpath(expr, **XML_NS)
        end

        # @return dmdSec node(s) whose ID equals this object's dmdid
        def dmd_node
          xpath("//mets:dmdSec[@ID='#{dmdid}']")
        end

        # Resolve a path from the XML against the directory of the XML file.
        # @param specified_path [String] path as written in the document
        # @return [String] the path unchanged if it starts with '/',
        #   otherwise joined onto the directory containing `path`
        def normalize_path(specified_path)
          if specified_path.start_with?('/')
            specified_path
          else
            File.join(File.dirname(path), specified_path)
          end
        end

        # returns hash of "use" key string to path value
        def page_files
          # the structMap div for this page holds pointers to its files:
          page_div = xpath("//mets:structMap//mets:div[@DMDID='#{dmdid}']")
          xpath('mets:fptr', page_div).each_with_object({}) do |pointer, files|
            pointed_id = pointer['FILEID']
            file_node = xpath(
              "//mets:fileSec//mets:fileGrp//mets:file[@ID='#{pointed_id}']"
            ).first
            use = file_node['USE']
            location = xpath('mets:FLocat', file_node).first
            files[use] = location.attribute_with_ns(
              'href',
              'http://www.w3.org/1999/xlink'
            ).to_s
          end
        end

        # @return [String] "<dir>/<dir basename>_1.xml", where <dir> is the
        #   parent of the directory containing `path`
        def container_path
          reel_dir = File.expand_path('..', File.dirname(path))
          File.join(reel_dir, "#{File.basename(reel_dir)}_1.xml")
        end

        # @return [NewspaperWorks::Ingest::NDNP::ContainerIngest, nil]
        #   container data for the file at container_path, nil if that
        #   file does not exist
        def container
          candidate = container_path
          NewspaperWorks::Ingest::NDNP::ContainerIngest.new(candidate) if File.exist?(candidate)
        end
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Source data for a single page described in an NDNP XML document:
      # resolves the page's file paths and lazily exposes its metadata.
      class PageIngest
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :dmdid, :doc, :files

        # @param path [String] path to the XML file describing the page
        # @param dmdid [String, nil] DMDID identifying the page in the XML
        # @param parent [Object, nil] optional owner whose parsed `doc`
        #   is reused instead of re-parsing the file
        # @raise [ArgumentError] if no path is given
        def initialize(path = nil, dmdid = nil, parent = nil)
          raise ArgumentError, 'No path provided' if path.nil?
          @path = path
          @dmdid = dmdid
          @doc = nil
          @parent = parent
          @metadata = nil
          load_doc
          # file paths from the XML, resolved relative to the XML file
          @files = page_files.values.map { |file_path| normalize_path(file_path) }
        end

        # @return [String] short representation naming class, path, dmdid.
        #   Values are passed as format arguments (not interpolated into
        #   the template) so a '%' in path or dmdid cannot break formatting.
        def inspect
          format(
            "<%<klass>s:0x000000000%<oid>x\n" \
            "\tpath: '%<path>s',\n" \
            "\tdmdid: '%<dmdid>s' ...>",
            klass: self.class,
            oid: object_id << 1,
            path: path,
            dmdid: dmdid
          )
        end

        # @return [NewspaperWorks::Ingest::NDNP::PageMetadata] metadata
        #   accessor for this page, built on first use and then reused
        def metadata
          @metadata ||= NewspaperWorks::Ingest::NDNP::PageMetadata.new(
            path,
            self,
            dmdid
          )
        end

        private

        # Reuse the parent's parsed document when available; otherwise
        # parse the file at `path`, closing the file handle afterwards.
        def load_doc
          @doc = @parent.doc unless @parent.nil?
          return unless @doc.nil?
          @doc = File.open(path) { |file| Nokogiri::XML(file) }
        end
      end
    end
  end
end
@@ -0,0 +1,157 @@
1
+ require 'newspaper_works/logging'
2
+
3
module NewspaperWorks
  module Ingest
    module NDNP
      # Creates a NewspaperPage work from NDNP page data, links it to its
      # NewspaperIssue, attaches the page's files, and links the page to
      # a reel container when container data is available.
      class PageIngester
        include NewspaperWorks::Logging
        include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper

        attr_accessor :page, :issue, :target, :opts

        delegate :path, :dmdid, to: :page

        # Fields copied from page metadata to the same-named attribute
        # of the target NewspaperPage.
        COPY_FIELDS = [
          :width,
          :height,
          :page_number,
          :identifier
        ].freeze

        # Subset of COPY_FIELDS whose single source value is wrapped in
        # an Array before assignment.
        COPY_FIELDS_PLURALIZE = [
          :identifier
        ].freeze

        # @param page [NewspaperWorks::Ingest::NDNP::PageIngest]
        #   source page data
        # @param issue [NewspaperIssue]
        #   issue work the new page is linked to
        # @param opts [Hash]
        #   ingest options, e.g. administrative metadata
        def initialize(page, issue, opts = {})
          @page = page
          @issue = issue
          @opts = opts
          # target is to-be-created NewspaperPage:
          @target = nil
          @work_files = nil
          configure_logger('ingest')
        end

        # Run the whole page ingest: create the work, attach files, link
        # to the reel container.
        def ingest
          construct_page
          ingest_page_files
          link_reel
        end

        # Create the NewspaperPage, copy descriptive and administrative
        # metadata onto it, link it to the issue, and save it.
        def construct_page
          @target = NewspaperPage.create!(title: page_title)
          write_log(
            "Created NewspaperPage work #{@target.id} "\
            "with title '#{@target.title[0]}'"
          )
          copy_page_metadata
          assign_administrative_metadata
          link_issue
          @target.save!
          write_log("Saved metadata to NewspaperPage work #{@target.id}")
        end

        # Ingest primary, derivative files; other derivatives including
        #   thumbnail, plain-text, json will be made by NewspaperWorks
        #   derivative service components as a consequence of committing
        #   files assigned (via actor stack, via WorkFiles).
        # A file with a .tif/.tiff extension (any letter case) is the
        #   primary file; every other file is assigned as a derivative.
        def ingest_page_files
          @work_files = NewspaperWorks::Data::WorkFiles.new(@target)
          page.files.each do |path|
            ext = path.downcase.split('.')[-1]
            if ['tif', 'tiff'].include?(ext)
              ingest_primary_file(path)
            else
              ingest_derivative_file(path)
            end
          end
          write_log("Beginning file attachment process (WorkFiles.commit!) "\
            "for work #{@target.id}")
          @work_files.commit!
        end

        # Find-or-create the reel container for this page and link the
        # page to it; does nothing when the page has no container data.
        def link_reel
          reel_data = @page.container
          return if reel_data.nil?
          ingester = NewspaperWorks::Ingest::NDNP::ContainerIngester.new(
            reel_data,
            issue.publication,
            @opts
          )
          # find-or-create container, linked to publication:
          ingester.ingest
          # link target page to container asset for reel:
          ingester.link(@target)
        end

        private

        # Assign the page's TIFF as primary file. When the TIFF named in
        # the page data does not exist on disk, one is generated from the
        # page's PDF instead.
        # @param path [String] path to TIFF as listed in page data
        def ingest_primary_file(path)
          unless File.exist?(path)
            # match the extension case-insensitively, consistent with the
            # TIFF detection in ingest_page_files:
            pdf_path = page.files.find { |p| p.downcase.end_with?('pdf') }
            # make and get TIFF path (to generated tmp file):
            # NOTE(review): pdf_path is nil when the page lists no PDF;
            # behavior of make_tiff in that case is not handled here.
            path = make_tiff(pdf_path)
          end
          write_log("Assigned primary file to work #{@target.id}, #{path}")
          @work_files.assign(path)
        end

        # @param path [String] path to a non-TIFF file for the page
        def ingest_derivative_file(path)
          write_log("Assigned derivative file to work #{@target.id}, #{path}")
          @work_files.derivatives.assign(path)
        end

        # Append the target page to the issue's ordered members and save
        # the issue.
        def link_issue
          issue.ordered_members << @target # page
          issue.save!
          write_log(
            "Linked NewspaperIssue work #{issue.id} "\
            "to NewspaperPage work #{@target.id}"
          )
        end

        # dir whitelist
        def whitelist
          Hyrax.config.whitelisted_ingest_dirs
        end

        # Generate TIFF in temporary file, return its path, given path to PDF
        # Also adds the system temp dir to the ingest whitelist if absent.
        # @param pdf_path [String] path to single-page PDF
        # @return [String] path to generated TIFF
        def make_tiff(pdf_path)
          write_log(
            "Creating TIFF from PDF in lieu of missing for work "\
            " (#{@target.id})",
            Logger::WARN
          )
          whitelist.push(Dir.tmpdir) unless whitelist.include?(Dir.tmpdir)
          NewspaperWorks::Ingest::PdfPages.new(pdf_path).to_a[0]
        end

        # Page title as issue title plus page number,
        #   e.g. "ACME Tribune (1910-01-02): Page 2"
        # @return [Array<String>] single-element array holding the
        #   composed page title
        def page_title
          ["#{issue.title.first}: Page #{@page.metadata.page_number}"]
        end

        # Copy each COPY_FIELDS value from page metadata to the target
        # NewspaperPage; fields listed in COPY_FIELDS_PLURALIZE are
        # wrapped in a single-element Array first.
        def copy_page_metadata
          metadata = page.metadata
          COPY_FIELDS.each do |fieldname|
            value = metadata.send(fieldname.to_s)
            pluralize = COPY_FIELDS_PLURALIZE.include?(fieldname)
            @target.send("#{fieldname}=", pluralize ? [value] : value)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,112 @@
1
+ require 'nokogiri'
2
+
3
module NewspaperWorks
  module Ingest
    module NDNP
      # Accessors for page-level metadata, read from the page's dmdSec in
      # the NDNP XML document and from the page's ALTO (OCR) file.
      class PageMetadata
        # mixin convenience methods for NDNP XML, plus XML_NS hash
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :dmdid, :doc

        # @param path [String, nil] path to the XML file
        # @param parent [Object, nil] optional owner whose parsed `doc`
        #   is reused instead of re-parsing the file
        # @param dmdid [String, nil] DMDID identifying the page in the XML
        # @raise [ArgumentError] if neither path nor parent is given
        def initialize(path = nil, parent = nil, dmdid = nil)
          raise ArgumentError, 'No context provided' if path.nil? && parent.nil?
          @path = path
          @parent = parent
          @dmdid = dmdid
          @doc = nil
          load_doc
        end

        # @return [String] short representation naming class, path, dmdid.
        #   Values are passed as format arguments (not interpolated into
        #   the template) so a '%' in path or dmdid cannot break formatting.
        def inspect
          format(
            "<%<klass>s:0x000000000%<oid>x\n" \
            "\tpath: '%<path>s',\n" \
            "\tdmdid: '%<dmdid>s' ...>",
            klass: self.class,
            oid: object_id << 1,
            path: path,
            dmdid: dmdid
          )
        end

        # Printed page number, if printed; optional field in NDNP spec.
        #   "Number" is used liberally, and may contain both alpha
        #   and numeric characters.  As such, return value is String.
        #
        #   If NDNP issue data fails to provide an explicitly
        #   human-readable page number (no page-number detail, or a
        #   detail without a number element), fallback to sequence
        #   number, in String form.
        #
        # @return [String, NilClass] Page "number" string
        def page_number
          detail = dmd_node.xpath(
            ".//mods:mods//mods:detail[@type='page number']",
            **XML_NS
          )
          number = detail.xpath("mods:number", **XML_NS).first
          return number.text unless number.nil?
          fallback = page_sequence_number
          fallback.to_s unless fallback.nil?
        end

        # Page sequence number, indexical to order in issue.
        #   "Number" here is one-indexed positive integer, position in
        #   issue.  Mandatory for page of issue, nil for page of reel.
        # @return [Integer,NilClass] Page sequence number, positive integer
        def page_sequence_number
          detail = dmd_node.xpath(
            ".//mods:mods//mods:extent[@unit='pages']",
            **XML_NS
          )
          node = detail.xpath("mods:start", **XML_NS).first
          node.text.to_i unless node.nil?
        end

        # Extract identifier from page ALTO, based on file name.
        #   XML parsing of big documents are expensive, so use regex to
        #   scan for fileName element, and return its value.
        # @return [String,NilClass] file name stripped of directory and
        #   extension, or nil when the ALTO has no fileName element.
        def identifier
          file_name = page_alto[%r{<fileName>([^<]*)</fileName>}, 1]
          stripped_filename(file_name) unless file_name.nil?
        end

        # @return [Integer] HEIGHT attribute of the ALTO Page element,
        #   0 when it cannot be found
        def height
          alto_page_meta('HEIGHT').to_i
        end

        # @return [Integer] WIDTH attribute of the ALTO Page element,
        #   0 when it cannot be found
        def width
          alto_page_meta('WIDTH').to_i
        end

        private

        # filename stripped of base path and file extension
        def stripped_filename(path)
          File.basename(path).split('.')[0]
        end

        # Reuse the parent's parsed document when available; otherwise
        # parse the file at `path`, closing the file handle afterwards.
        def load_doc
          @doc = @parent.doc unless @parent.nil?
          return unless @doc.nil?
          @doc = File.open(path) { |file| Nokogiri::XML(file) }
        end

        # @return [String] path to the page file whose use is 'ocr',
        #   resolved relative to the XML file
        def alto_path
          specified_path = page_files['ocr']
          normalize_path(specified_path)
        end

        # @return [String] full contents of the ALTO file (read on each call)
        def page_alto
          File.read(alto_path)
        end

        # Read one attribute from the first <Page ...> start tag in the
        # ALTO text, located by regex rather than a full XML parse.
        # @param key [String] attribute name, e.g. 'HEIGHT'
        # @return [String,NilClass] attribute value; nil when there is no
        #   Page tag or the attribute is absent
        def alto_page_meta(key)
          start_tag = page_alto[/<Page [^>]*>/]
          return if start_tag.nil?
          # parse xml <Page> start tag fragment, get attributes:
          page_tag = Nokogiri::XML(start_tag).root
          page_tag[key]
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'newspaper_works/ingest/ndnp/ndnp_mets_helper'
2
+ require 'newspaper_works/ingest/ndnp/ndnp_asset_helper'
3
+ require 'newspaper_works/ingest/ndnp/page_ingest'
4
+ require 'newspaper_works/ingest/ndnp/page_ingester'
5
+ require 'newspaper_works/ingest/ndnp/page_metadata'
6
+ require 'newspaper_works/ingest/ndnp/issue_ingest'
7
+ require 'newspaper_works/ingest/ndnp/issue_ingester'
8
+ require 'newspaper_works/ingest/ndnp/issue_metadata'
9
+ require 'newspaper_works/ingest/ndnp/container_ingest'
10
+ require 'newspaper_works/ingest/ndnp/container_ingester'
11
+ require 'newspaper_works/ingest/ndnp/container_metadata'
12
+ require 'newspaper_works/ingest/ndnp/batch_xml_ingest'
13
+ require 'newspaper_works/ingest/ndnp/batch_ingester'
14
+
15
module NewspaperWorks
  module Ingest
    # Namespace for NDNP-specific ingest components; the components
    # themselves are loaded by the require statements in this file.
    module NDNP; end
  end
end
@@ -0,0 +1,56 @@
1
module NewspaperWorks
  module Ingest
    # Ingest for a NewspaperIssue whose source file is a PDF: after the
    # inherited import, the PDF is split into per-page TIFF files and a
    # child NewspaperPage work is created for each one.
    class NewspaperIssueIngest < BaseIngest
      @configured = false

      class << self
        # One-time setup (per class): make sure the system temp dir is an
        # allowed ingest directory. PDF ingest may save page images to
        # /tmp (via Dir.tmpdir), which needs whitelisting for use by
        # NewspaperWorks::Data::WorkFiles.commit! via Hyrax
        # CreateWithRemoteFilesActor.
        def configure
          return if @configured == true
          allowed_dirs = Hyrax.config.whitelisted_ingest_dirs
          tmp_dir = Dir.tmpdir
          allowed_dirs.push(tmp_dir) unless allowed_dirs.include?(tmp_dir)
          @configured = true
        end
      end

      # Import the issue: the inherited import handles the PDF itself on
      # the issue, then child works are created from the split pages.
      def import
        super
        create_child_pages
      end

      # Creates child pages with attached TIFF masters, can be called by
      #   `import`, or independently if `load` is called first.  The
      #   latter is appropriate if framework is already handling the
      #   NewspaperIssue file attachment (e.g. Hyrax upload via browser).
      # The issue is saved (without validation) only when at least one
      #   page was produced.
      def create_child_pages
        self.class.configure
        tiff_paths = NewspaperWorks::Ingest::PdfPages.new(path).to_a
        tiff_paths.each.with_index do |tiff_path, position|
          child = new_child_page_with_file(tiff_path, position)
          @work.ordered_members << child
        end
        return if tiff_paths.empty?
        @work.save!(validate: false)
      end

      # Build, populate, and save one child NewspaperPage for a page image.
      # @param tiffpath [String] path to the page's TIFF file
      # @param idx [Integer] zero-based position of the page
      # @return [NewspaperPage] the saved child page
      def new_child_page_with_file(tiffpath, idx)
        sequence = idx + 1
        NewspaperPage.new.tap do |child|
          child.title = ["#{@work.title.first}: Page #{sequence}"]
          # technically, a sequence number distinct from displayed page number
          child.page_number = sequence.to_s
          # depositor and admin set are inherited from the issue:
          child.depositor = @work.depositor
          child.admin_set_id = @work.admin_set_id
          # copying permissions also by effect copies visibility:
          child.permissions_attributes = @work.permissions.map(&:to_hash)
          NewspaperPageIngest.new(child).ingest(tiffpath)
          child.save!
        end
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
module NewspaperWorks
  module Ingest
    # Ingest for a NewspaperPage work; adds nothing to BaseIngest and
    # exists as a distinctly named subclass.
    class NewspaperPageIngest < BaseIngest; end
  end
end