newspaper_works 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (461) hide show
  1. checksums.yaml +7 -0
  2. data/.fcrepo_wrapper +4 -0
  3. data/.gitignore +43 -0
  4. data/.rubocop.yml +143 -0
  5. data/.solr_wrapper +8 -0
  6. data/.travis.yml +50 -0
  7. data/Gemfile +47 -0
  8. data/LICENSE +203 -0
  9. data/README.md +159 -0
  10. data/Rakefile +38 -0
  11. data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
  12. data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
  13. data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
  14. data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
  15. data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
  16. data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
  17. data/app/assets/config/newspaper_works_manifest.js +2 -0
  18. data/app/assets/images/newspaper_works/.keep +0 -0
  19. data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
  20. data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
  21. data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
  22. data/app/assets/javascripts/newspaper_works.js +4 -0
  23. data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
  24. data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
  25. data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
  26. data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
  27. data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
  28. data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
  29. data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
  30. data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
  31. data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
  32. data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
  33. data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
  34. data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
  35. data/app/forms/hyrax/newspaper_article_form.rb +11 -0
  36. data/app/forms/hyrax/newspaper_container_form.rb +11 -0
  37. data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
  38. data/app/forms/hyrax/newspaper_page_form.rb +15 -0
  39. data/app/forms/hyrax/newspaper_title_form.rb +12 -0
  40. data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
  41. data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
  42. data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
  43. data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
  44. data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
  45. data/app/helpers/newspaper_works/application_helper.rb +5 -0
  46. data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
  47. data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
  48. data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
  49. data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
  50. data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
  51. data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
  52. data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
  53. data/app/indexers/newspaper_article_indexer.rb +16 -0
  54. data/app/indexers/newspaper_container_indexer.rb +18 -0
  55. data/app/indexers/newspaper_issue_indexer.rb +26 -0
  56. data/app/indexers/newspaper_page_indexer.rb +9 -0
  57. data/app/indexers/newspaper_title_indexer.rb +19 -0
  58. data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
  59. data/app/jobs/newspaper_works/application_job.rb +4 -0
  60. data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
  61. data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
  62. data/app/mailers/newspaper_works/application_mailer.rb +8 -0
  63. data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
  64. data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
  65. data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
  66. data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
  67. data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
  68. data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
  69. data/app/models/file_set.rb +10 -0
  70. data/app/models/newspaper_article.rb +158 -0
  71. data/app/models/newspaper_container.rb +86 -0
  72. data/app/models/newspaper_issue.rb +115 -0
  73. data/app/models/newspaper_page.rb +70 -0
  74. data/app/models/newspaper_title.rb +111 -0
  75. data/app/models/newspaper_works/application_record.rb +6 -0
  76. data/app/models/newspaper_works/derivative_attachment.rb +8 -0
  77. data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
  78. data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
  79. data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
  80. data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
  81. data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
  82. data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
  83. data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
  84. data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
  85. data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
  86. data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
  87. data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
  88. data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
  89. data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
  90. data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
  91. data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
  92. data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
  93. data/app/services/hyrax/article_genre_service.rb +9 -0
  94. data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
  95. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
  96. data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
  97. data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
  98. data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
  99. data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
  100. data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
  101. data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
  102. data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
  103. data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
  104. data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
  105. data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
  106. data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
  107. data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
  108. data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
  109. data/app/views/catalog/_snippets_more.html.erb +16 -0
  110. data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
  111. data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
  112. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  113. data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
  114. data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
  115. data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
  116. data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
  117. data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
  118. data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
  119. data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
  120. data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
  121. data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
  122. data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
  123. data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
  124. data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
  125. data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
  126. data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
  127. data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
  128. data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
  129. data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
  130. data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
  131. data/app/views/newspaper_works/base/_show.html.erb +45 -0
  132. data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
  133. data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
  134. data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
  135. data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
  136. data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
  137. data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
  138. data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
  139. data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
  140. data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
  141. data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
  142. data/app/views/records/edit_fields/_genre.html.erb +4 -0
  143. data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
  144. data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
  145. data/bin/rails +13 -0
  146. data/config/fcrepo_wrapper_test.yml +5 -0
  147. data/config/initializers/assets.rb +2 -0
  148. data/config/locales/newspaper_article.de.yml +12 -0
  149. data/config/locales/newspaper_article.en.yml +12 -0
  150. data/config/locales/newspaper_article.es.yml +12 -0
  151. data/config/locales/newspaper_article.fr.yml +12 -0
  152. data/config/locales/newspaper_article.it.yml +12 -0
  153. data/config/locales/newspaper_article.pt-BR.yml +12 -0
  154. data/config/locales/newspaper_article.zh.yml +12 -0
  155. data/config/locales/newspaper_container.de.yml +8 -0
  156. data/config/locales/newspaper_container.en.yml +8 -0
  157. data/config/locales/newspaper_container.es.yml +8 -0
  158. data/config/locales/newspaper_container.fr.yml +8 -0
  159. data/config/locales/newspaper_container.it.yml +8 -0
  160. data/config/locales/newspaper_container.pt-BR.yml +8 -0
  161. data/config/locales/newspaper_container.zh.yml +8 -0
  162. data/config/locales/newspaper_issue.de.yml +8 -0
  163. data/config/locales/newspaper_issue.en.yml +8 -0
  164. data/config/locales/newspaper_issue.es.yml +8 -0
  165. data/config/locales/newspaper_issue.fr.yml +8 -0
  166. data/config/locales/newspaper_issue.it.yml +8 -0
  167. data/config/locales/newspaper_issue.pt-BR.yml +8 -0
  168. data/config/locales/newspaper_issue.zh.yml +8 -0
  169. data/config/locales/newspaper_page.de.yml +15 -0
  170. data/config/locales/newspaper_page.en.yml +15 -0
  171. data/config/locales/newspaper_page.es.yml +15 -0
  172. data/config/locales/newspaper_page.fr.yml +15 -0
  173. data/config/locales/newspaper_page.it.yml +15 -0
  174. data/config/locales/newspaper_page.pt-BR.yml +15 -0
  175. data/config/locales/newspaper_page.zh.yml +15 -0
  176. data/config/locales/newspaper_title.de.yml +8 -0
  177. data/config/locales/newspaper_title.en.yml +8 -0
  178. data/config/locales/newspaper_title.es.yml +8 -0
  179. data/config/locales/newspaper_title.fr.yml +8 -0
  180. data/config/locales/newspaper_title.it.yml +8 -0
  181. data/config/locales/newspaper_title.pt-BR.yml +8 -0
  182. data/config/locales/newspaper_title.zh.yml +8 -0
  183. data/config/locales/newspaper_works.de.yml +50 -0
  184. data/config/locales/newspaper_works.en.yml +52 -0
  185. data/config/locales/newspaper_works.es.yml +52 -0
  186. data/config/locales/newspaper_works.fr.yml +52 -0
  187. data/config/locales/newspaper_works.it.yml +52 -0
  188. data/config/locales/newspaper_works.pt-BR.yml +52 -0
  189. data/config/locales/newspaper_works.zh.yml +52 -0
  190. data/config/routes.rb +9 -0
  191. data/config/solr_wrapper_test.yml +9 -0
  192. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  193. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  194. data/config/test-fixture/solr-config/elevate.xml +36 -0
  195. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  196. data/config/test-fixture/solr-config/protwords.txt +21 -0
  197. data/config/test-fixture/solr-config/schema.xml +366 -0
  198. data/config/test-fixture/solr-config/scripts.conf +24 -0
  199. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  200. data/config/test-fixture/solr-config/spellings.txt +2 -0
  201. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  202. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  203. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  204. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  205. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  206. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  207. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  208. data/config/vendor/imagemagick-6-policy.xml +76 -0
  209. data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
  210. data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
  211. data/lib/generators/newspaper_works/assets_generator.rb +29 -0
  212. data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
  213. data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
  214. data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
  215. data/lib/generators/newspaper_works/install_generator.rb +97 -0
  216. data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
  217. data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
  218. data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
  219. data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
  220. data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
  221. data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
  222. data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
  223. data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
  224. data/lib/newspaper_works/configuration.rb +14 -0
  225. data/lib/newspaper_works/data/fileset_helper.rb +25 -0
  226. data/lib/newspaper_works/data/path_helper.rb +40 -0
  227. data/lib/newspaper_works/data/work_derivatives.rb +314 -0
  228. data/lib/newspaper_works/data/work_file.rb +92 -0
  229. data/lib/newspaper_works/data/work_files.rb +181 -0
  230. data/lib/newspaper_works/data.rb +35 -0
  231. data/lib/newspaper_works/engine.rb +42 -0
  232. data/lib/newspaper_works/errors.rb +14 -0
  233. data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
  234. data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
  235. data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
  236. data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
  237. data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
  238. data/lib/newspaper_works/ingest/from_command.rb +52 -0
  239. data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
  240. data/lib/newspaper_works/ingest/issue_images.rb +51 -0
  241. data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
  242. data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
  243. data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
  244. data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
  245. data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
  246. data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
  247. data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
  248. data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
  249. data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
  250. data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
  251. data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
  252. data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
  253. data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
  254. data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
  255. data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
  256. data/lib/newspaper_works/ingest/ndnp.rb +21 -0
  257. data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
  258. data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
  259. data/lib/newspaper_works/ingest/page_image.rb +52 -0
  260. data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
  261. data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
  262. data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
  263. data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
  264. data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
  265. data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
  266. data/lib/newspaper_works/ingest/publication_info.rb +44 -0
  267. data/lib/newspaper_works/ingest.rb +90 -0
  268. data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
  269. data/lib/newspaper_works/logging.rb +54 -0
  270. data/lib/newspaper_works/page_finder.rb +62 -0
  271. data/lib/newspaper_works/resource_fetcher.rb +78 -0
  272. data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
  273. data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
  274. data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
  275. data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
  276. data/lib/newspaper_works/text_extraction.rb +10 -0
  277. data/lib/newspaper_works/version.rb +3 -0
  278. data/lib/newspaper_works.rb +19 -0
  279. data/lib/tasks/newspaper_works_tasks.rake +39 -0
  280. data/newspaper_works.gemspec +49 -0
  281. data/spec/.keep.txt +1 -0
  282. data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
  283. data/spec/controllers/catalog_controller_spec.rb +63 -0
  284. data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
  285. data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
  286. data/spec/factories/ability.rb +6 -0
  287. data/spec/factories/newspaper_issue.rb +7 -0
  288. data/spec/factories/newspaper_issue_ingest.rb +6 -0
  289. data/spec/factories/newspaper_page.rb +7 -0
  290. data/spec/factories/newspaper_page_ingest.rb +6 -0
  291. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  292. data/spec/factories/newspaper_title.rb +8 -0
  293. data/spec/factories/uploaded_pdf_file.rb +9 -0
  294. data/spec/factories/user.rb +13 -0
  295. data/spec/features/front_pages_for_title_spec.rb +19 -0
  296. data/spec/features/newspaper_title_search_spec.rb +30 -0
  297. data/spec/features/newspapers_search_spec.rb +49 -0
  298. data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
  299. data/spec/features_shared.rb +71 -0
  300. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  301. data/spec/fixtures/files/4.1.07.tiff +0 -0
  302. data/spec/fixtures/files/README.md +7 -0
  303. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  304. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  305. data/spec/fixtures/files/credits.md +16 -0
  306. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  307. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  308. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  309. data/spec/fixtures/files/minimal-alto.xml +31 -0
  310. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  311. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  312. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  313. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  314. data/spec/fixtures/files/ocr_alto.xml +202 -0
  315. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  316. data/spec/fixtures/files/ocr_color.tiff +0 -0
  317. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  318. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  319. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  320. data/spec/fixtures/files/page1.tiff +0 -0
  321. data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
  322. data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
  323. data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
  324. data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
  325. data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
  326. data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
  327. data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
  328. data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
  329. data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
  330. data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
  331. data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
  332. data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
  333. data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
  334. data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
  335. data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
  336. data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
  337. data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
  338. data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
  339. data/spec/fixtures/files/resource_mocks/urls.json +82 -0
  340. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  341. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  342. data/spec/fixtures/files/thumbnail.jpg +0 -0
  343. data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
  344. data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
  345. data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
  346. data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
  347. data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
  348. data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
  349. data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
  350. data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
  351. data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
  352. data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
  353. data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
  354. data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
  355. data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
  356. data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
  357. data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
  358. data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
  359. data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
  360. data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
  361. data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
  362. data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
  363. data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
  364. data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
  365. data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
  366. data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
  367. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
  368. data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
  369. data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
  370. data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
  371. data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
  372. data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
  373. data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
  374. data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
  375. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
  376. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
  377. data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
  378. data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
  379. data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
  380. data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
  381. data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
  382. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
  383. data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
  384. data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
  385. data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
  386. data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
  387. data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
  388. data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
  389. data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
  390. data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
  391. data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
  392. data/spec/lib/newspaper_works/logging_spec.rb +53 -0
  393. data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
  394. data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
  395. data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
  396. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
  397. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
  398. data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
  399. data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
  400. data/spec/misc_shared.rb +109 -0
  401. data/spec/model_shared.rb +134 -0
  402. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
  403. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
  404. data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
  405. data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
  406. data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
  407. data/spec/models/newspaper_article_spec.rb +73 -0
  408. data/spec/models/newspaper_container_spec.rb +111 -0
  409. data/spec/models/newspaper_issue_spec.rb +91 -0
  410. data/spec/models/newspaper_page_spec.rb +44 -0
  411. data/spec/models/newspaper_title_spec.rb +116 -0
  412. data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
  413. data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
  414. data/spec/models/solr_document_spec.rb +14 -0
  415. data/spec/ndnp_shared.rb +48 -0
  416. data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
  417. data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
  418. data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
  419. data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
  420. data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
  421. data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
  422. data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
  423. data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
  424. data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
  425. data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
  426. data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
  427. data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
  428. data/spec/routing/route_spec.rb +52 -0
  429. data/spec/search_builders/custom_search_builder_spec.rb +34 -0
  430. data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
  431. data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
  432. data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
  433. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
  434. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
  435. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
  436. data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
  437. data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
  438. data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
  439. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
  440. data/spec/spec_helper.rb +261 -0
  441. data/spec/support/controller_level_helpers.rb +28 -0
  442. data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
  443. data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
  444. data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
  445. data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
  446. data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
  447. data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
  448. data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
  449. data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
  450. data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
  451. data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
  452. data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
  453. data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
  454. data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
  455. data/tasks/newspaperworks_dev.rake +26 -0
  456. data/test/integration/navigation_test.rb +7 -0
  457. data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
  458. data/test/newspaper_works_test.rb +7 -0
  459. data/test/test_helper.rb +17 -0
  460. data/tmp/.keep +0 -0
  461. metadata +1037 -0
@@ -0,0 +1,101 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Builds a NewspaperIssue work from NDNP issue source data, ingests
      # each of the issue's pages, links the issue to a publication, and
      # finally enqueues a job that composes the issue PDF.
      class IssueIngester
        include NewspaperWorks::Logging
        include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper
        include NewspaperWorks::Ingest::PubFinder

        attr_accessor :issue, :target, :opts

        delegate :path, to: :issue

        # Names of single-valued fields copied verbatim from the issue
        # metadata source onto the target work.
        COPY_FIELDS = %i[
          lccn
          edition_number
          edition_name
          volume
          publication_date
          held_by
          issue_number
        ].freeze

        # @param issue [NewspaperWorks::Ingest::NDNP::IssueIngest]
        #   source issue data
        # @param opts [Hash]
        #   ingest options, e.g. administrative metadata
        def initialize(issue, opts = {})
          @issue = issue
          @opts = opts
          # target is the to-be-created NewspaperIssue:
          @target = nil
          configure_logger('ingest')
        end

        # Run the whole issue ingest: create the issue work, ingest its
        # pages, then enqueue PDF composition for the issue.
        def ingest
          construct_issue
          ingest_pages
          NewspaperWorks::ComposeIssuePDFJob.perform_later(@target)
        end

        # Create the issue work and link it to its publication.
        def construct_issue
          create_issue
          find_or_create_linked_publication
        end

        # Ingest every page enumerated by the source issue.
        def ingest_pages
          issue.each { |page_data| page_ingester(page_data).ingest }
        end

        private

        # @return [NewspaperWorks::Ingest::NDNP::PageIngester]
        #   ingester for one page, bound to the target issue and options
        def page_ingester(page_data)
          NewspaperWorks::Ingest::NDNP::PageIngester.new(page_data, @target, @opts)
        end

        # Publication date from source metadata (ISO 8601), formatted
        # for display, e.g. "January 25, 1911".
        def publication_date
          DateTime.iso8601(issue.metadata.publication_date).strftime('%B %-d, %Y')
        end

        # Publication title with any trailing " (...)" qualifier removed.
        def publication_title(issue)
          raw_title = issue.metadata.publication_title
          raw_title.strip.split(/ \(/).first
        end

        # Title for the issue work: publication title plus display date.
        def issue_title
          "#{publication_title(issue)}: #{publication_date}"
        end

        # Set the (required, multi-valued) title, then copy every field in
        # COPY_FIELDS from the metadata source to the target work.
        def copy_issue_metadata
          source = issue.metadata
          @target.title = [issue_title]
          COPY_FIELDS.each do |field|
            @target.send("#{field}=", source.send(field.to_s))
          end
        end

        # Create the NewspaperIssue, populate descriptive and
        # administrative metadata, save it, and log the result.
        def create_issue
          @target = NewspaperIssue.create
          copy_issue_metadata
          assign_administrative_metadata
          @target.save!
          write_log("Saved metadata to new NewspaperIssue #{@target.id}")
        end

        # Link the target issue to a publication located (or created) by
        # LCCN and title, via the PubFinder mixin.
        def find_or_create_linked_publication
          find_or_create_publication_for_issue(
            @target,
            issue.metadata.lccn,
            publication_title(issue),
            @opts
          )
        end
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Read-only accessors for issue-level metadata held in an NDNP issue
      # METS/MODS XML document.
      class IssueMetadata
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :doc, :parent

        # @param path [String] path to issue XML
        # @param parent [Object, NilClass] optional owner; when given, its
        #   already-parsed `doc` is reused instead of re-reading `path`
        def initialize(path, parent = nil)
          @path = path
          @parent = parent
          @doc = nil
          load_doc
        end

        # Values are passed as format arguments (not interpolated into the
        # format string) so a literal '%' in the path cannot break format.
        def inspect
          format(
            "<%<cls>s:0x000000000%<oid>x\n" \
            "\tpath: '%<path>s' ...>",
            cls: self.class,
            oid: object_id << 1,
            path: path
          )
        end

        # LCCN (mandatory)
        # @return [String]
        def lccn
          xpath("//mods:identifier[@type='lccn']").text
        end

        # Volume number (optional)
        # @return [String,NilClass]
        def volume
          optional_text("//mods:detail[@type='volume']/mods:number")
        end

        # Issue number (optional)
        # @return [String,NilClass]
        def issue_number
          optional_text("//mods:detail[@type='issue']/mods:number")
        end

        # Edition name (optional): the edition "caption", which may be
        # used as a human-readable label for the edition.
        # @return [String,NilClass]
        def edition_name
          optional_text("//mods:detail[@type='edition']/mods:caption")
        end

        # Edition number (mandatory)
        # @return [String]
        def edition_number
          xpath("//mods:detail[@type='edition']/mods:number").text
        end

        # Issue date (mandatory field) as ISO 8601 datestamp string
        # @return [String] (ISO-8601 date) publication date
        def publication_date
          xpath("//mods:originInfo/mods:dateIssued").text
        end

        # Publication title, taken from the reel (container) metadata when
        # a parent with a container is available, otherwise parsed from
        # the //mets/@LABEL attribute, based on the label convention:
        #   "ACME Times (Springfield, UT), 1911-01-25, First Edition"
        # in which case the name and parenthesized place of publication
        # (everything before the ", <digit>" date separator) is returned.
        # @return [String,NilClass] title, or nil when no usable label
        def publication_title
          # try from reel first
          reel = parent.nil? ? nil : parent.container
          return reel.metadata.title unless reel.nil?
          # fallback to parsing //mets/@LABEL
          label = xpath('//mets:mets/@LABEL').first
          return if label.nil?
          # `first` (rather than chaining on [0]) yields nil for an
          # empty label instead of raising:
          label.value.split(/, [0-9]/).first
        end

        # Original Source Repository (NDNP-mandatory)
        # @return [String,NilClass] displayLabel of the first
        #   physicalLocation element, nil if the element is absent
        def held_by
          location = xpath("//mods:physicalLocation").first
          location.nil? ? nil : location['displayLabel']
        end

        private

        # Text of nodes matching an XPath expression, or nil if none match.
        def optional_text(expr)
          nodes = xpath(expr)
          nodes.empty? ? nil : nodes.text
        end

        # Reuse the parent's parsed document when available; otherwise
        # parse the file at `path`. Block form of File.open closes the
        # handle once Nokogiri has read the document.
        def load_doc
          @doc = @parent.doc unless @parent.nil?
          @doc = File.open(path) { |io| Nokogiri::XML(io) } if @doc.nil?
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Mixin for ingester classes that need to assign administrative
      # metadata to the work they create.
      module NDNPAssetHelper
        # Assign administrative metadata to a work, using the options
        # saved on ingester state.
        # Pre-conditions for use:
        #   consuming class implements @target pointing to work asset
        #   consuming class implements @opts pointing to Hash
        # @param work [Object, NilClass] work to modify; defaults to @target
        def assign_administrative_metadata(work = nil)
          asset = work || @target
          NewspaperWorks::Ingest.assign_administrative_metadata(asset, @opts)
        end
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ require 'nokogiri'
2
+
3
module NewspaperWorks
  module Ingest
    module NDNP
      # Mixin providing namespace-aware XPath plus file and reel lookup
      # helpers for NDNP METS documents. Consuming classes are expected to
      # provide `doc`, `path` and (for page-level helpers) `dmdid`.
      module NDNPMetsHelper
        # Prefix-to-URI namespace map; each prefix is registered in both
        # lower and upper case.
        XML_NS = {
          mets: 'http://www.loc.gov/METS/',
          METS: 'http://www.loc.gov/METS/',
          mods: 'http://www.loc.gov/mods/v3',
          MODS: 'http://www.loc.gov/mods/v3',
          ndnp: 'http://www.loc.gov/ndnp',
          NDNP: 'http://www.loc.gov/ndnp'
        }.freeze

        # Evaluate an XPath expression with the XML_NS namespace map.
        # @param expr [String] XPath expression
        # @param context [Object, NilClass] node to query; defaults to `doc`
        def xpath(expr, context = nil)
          node = context || doc
          node.xpath(expr, **XML_NS)
        end

        # dmdSec node(s) whose ID matches this object's `dmdid`.
        def dmd_node
          xpath("//mets:dmdSec[@ID='#{dmdid}']")
        end

        # Paths starting with '/' are returned unchanged; any other path
        # is resolved against the directory containing `path`.
        def normalize_path(specified_path)
          if specified_path.start_with?('/')
            specified_path
          else
            File.join(File.dirname(path), specified_path)
          end
        end

        # returns hash of "use" key string to path value
        def page_files
          # get pointers from structmap:
          file_group = xpath("//mets:structMap//mets:div[@DMDID='#{dmdid}']")
          xpath('mets:fptr', file_group).each_with_object({}) do |fptr, files|
            file_id = fptr['FILEID']
            file_node = xpath(
              "//mets:fileSec//mets:fileGrp//mets:file[@ID='#{file_id}']"
            ).first
            use = file_node['USE']
            locator = xpath('mets:FLocat', file_node).first
            href = locator.attribute_with_ns(
              'href',
              'http://www.w3.org/1999/xlink'
            )
            files[use] = href.to_s
          end
        end

        # Path of the reel XML: "<reel dir>/<reel dir name>_1.xml", where
        # the reel dir is the parent of the directory containing `path`.
        def container_path
          reel_dir = File.expand_path('..', File.dirname(path))
          File.join(reel_dir, "#{File.basename(reel_dir)}_1.xml")
        end

        # @return [NewspaperWorks::Ingest::NDNP::ContainerIngest, NilClass]
        #   reel source data, or nil when no reel XML file exists
        def container
          reel_path = container_path
          return unless File.exist?(reel_path)
          NewspaperWorks::Ingest::NDNP::ContainerIngest.new(reel_path)
        end
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Source data for a single page within an NDNP METS document,
      # identified by its DMDID: resolved file paths plus lazily-built
      # page metadata.
      class PageIngest
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :dmdid, :doc, :files

        # @param path [String] path to the XML file describing the page
        # @param dmdid [String, NilClass] DMDID of the page within the doc
        # @param parent [Object, NilClass] optional owner; when given, its
        #   already-parsed `doc` is reused instead of re-reading `path`
        # @raise [ArgumentError] if path is nil
        def initialize(path = nil, dmdid = nil, parent = nil)
          raise ArgumentError, 'No path provided' if path.nil?
          @path = path
          @dmdid = dmdid
          @doc = nil
          @parent = parent
          @metadata = nil
          load_doc
          # file paths referenced for this page, resolved relative to
          # the directory of the XML document where not absolute:
          @files = page_files.values.map { |file_path| normalize_path(file_path) }
        end

        # Values are passed as format arguments (not interpolated into the
        # format string) so a literal '%' in them cannot break format.
        def inspect
          format(
            "<%<cls>s:0x000000000%<oid>x\n" \
            "\tpath: '%<path>s',\n" \
            "\tdmdid: '%<dmdid>s' ...>",
            cls: self.class,
            oid: object_id << 1,
            path: path,
            dmdid: dmdid
          )
        end

        # @return [NewspaperWorks::Ingest::NDNP::PageMetadata]
        #   metadata accessor for this page, built once and cached
        def metadata
          @metadata ||= NewspaperWorks::Ingest::NDNP::PageMetadata.new(
            path,
            self,
            dmdid
          )
        end

        private

        # Reuse the parent's parsed document when available; otherwise
        # parse the file at `path`. Block form of File.open closes the
        # handle once Nokogiri has read the document.
        def load_doc
          @doc = @parent.doc unless @parent.nil?
          @doc = File.open(path) { |io| Nokogiri::XML(io) } if @doc.nil?
        end
      end
    end
  end
end
@@ -0,0 +1,157 @@
1
+ require 'newspaper_works/logging'
2
+
3
module NewspaperWorks
  module Ingest
    module NDNP
      # Creates a NewspaperPage work from NDNP page source data, links it
      # to its issue, attaches the page's files, and links it to the
      # container (reel) when reel data is available.
      class PageIngester
        include NewspaperWorks::Logging
        include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper

        attr_accessor :page, :issue, :target, :opts

        delegate :path, :dmdid, to: :page

        # Fields copied from page metadata onto the target work.
        COPY_FIELDS = [
          :width,
          :height,
          :page_number,
          :identifier
        ].freeze

        # Subset of COPY_FIELDS whose value is wrapped in an Array on
        # assignment to the target.
        COPY_FIELDS_PLURALIZE = [
          :identifier
        ].freeze

        # File extensions (lower-case, with dot) treated as the primary
        # file; every other file is attached as a derivative.
        TIFF_EXTENSIONS = %w[.tif .tiff].freeze

        # @param page [NewspaperWorks::Ingest::NDNP::PageIngest]
        #   source page data
        # @param issue [NewspaperIssue]
        #   issue work the page will be linked to
        # @param opts [Hash]
        #   ingest options, e.g. administrative metadata
        def initialize(page, issue, opts = {})
          @page = page
          @issue = issue
          @opts = opts
          # target is to-be-created NewspaperPage:
          @target = nil
          @work_files = nil
          configure_logger('ingest')
        end

        # Run the whole page ingest: create work, attach files, link reel.
        def ingest
          construct_page
          ingest_page_files
          link_reel
        end

        # Create the NewspaperPage, copy descriptive and administrative
        # metadata onto it, link it to the issue, and save.
        def construct_page
          @target = NewspaperPage.create!(title: page_title)
          write_log(
            "Created NewspaperPage work #{@target.id} "\
            "with title '#{@target.title[0]}'"
          )
          copy_page_metadata
          assign_administrative_metadata
          link_issue
          @target.save!
          write_log("Saved metadata to NewspaperPage work #{@target.id}")
        end

        # Ingest primary, derivative files; other derivatives including
        # thumbnail, plain-text, json will be made by NewspaperWorks
        # derivative service components as a consequence of committing
        # files assigned (via actor stack, via WorkFiles).
        def ingest_page_files
          @work_files = NewspaperWorks::Data::WorkFiles.new(@target)
          page.files.each do |file_path|
            if TIFF_EXTENSIONS.include?(File.extname(file_path).downcase)
              ingest_primary_file(file_path)
            else
              ingest_derivative_file(file_path)
            end
          end
          write_log("Beginning file attachment process (WorkFiles.commit!) "\
            "for work #{@target.id}")
          @work_files.commit!
        end

        # Find-or-create the container (reel) for this page and link the
        # page to it; does nothing when the page has no reel data.
        def link_reel
          reel_data = @page.container
          return if reel_data.nil?
          ingester = NewspaperWorks::Ingest::NDNP::ContainerIngester.new(
            reel_data,
            issue.publication,
            @opts
          )
          # find-or-create container, linked to publication:
          ingester.ingest
          # link target page to container asset for reel:
          ingester.link(@target)
        end

        private

        # Assign the primary (TIFF) file. When the referenced TIFF does
        # not exist on disk, one is generated from the page's PDF.
        # @param path [String] path to TIFF named by the source data
        # @raise [RuntimeError] if the TIFF is missing and the page has
        #   no PDF to generate it from
        def ingest_primary_file(path)
          unless File.exist?(path)
            pdf_path = page.files.find { |f| File.extname(f).downcase == '.pdf' }
            if pdf_path.nil?
              raise "Missing TIFF #{path} and no PDF available to " \
                "generate it for work #{@target.id}"
            end
            # make and get TIFF path (to generated tmp file):
            path = make_tiff(pdf_path)
          end
          write_log("Assigned primary file to work #{@target.id}, #{path}")
          @work_files.assign(path)
        end

        # Assign a non-primary file as a derivative of the work.
        def ingest_derivative_file(path)
          write_log("Assigned derivative file to work #{@target.id}, #{path}")
          @work_files.derivatives.assign(path)
        end

        # Append the target page to the issue's ordered members and save
        # the issue.
        def link_issue
          issue.ordered_members << @target # page
          issue.save!
          write_log(
            "Linked NewspaperIssue work #{issue.id} "\
            "to NewspaperPage work #{@target.id}"
          )
        end

        # dir whitelist
        def whitelist
          Hyrax.config.whitelisted_ingest_dirs
        end

        # Generate TIFF in temporary file, return its path, given path to PDF
        # @param pdf_path [String] path to single-page PDF
        # @return [String] path to generated TIFF
        def make_tiff(pdf_path)
          write_log(
            "Creating TIFF from PDF in lieu of missing for work "\
            " (#{@target.id})",
            Logger::WARN
          )
          # generated TIFF is written under Dir.tmpdir, which must be
          # whitelisted for ingest:
          whitelist.push(Dir.tmpdir) unless whitelist.include?(Dir.tmpdir)
          NewspaperWorks::Ingest::PdfPages.new(pdf_path).to_a[0]
        end

        # Page title as issue title plus page number,
        #   e.g. "ACME Tribune (1910-01-02): Page 2"
        # @return [Array<String>] single-element array holding the
        #   composed page title
        def page_title
          ["#{issue.title.first}: Page #{@page.metadata.page_number}"]
        end

        # Copy every field in COPY_FIELDS from the page metadata source
        # to the target NewspaperPage, wrapping the value in an Array for
        # fields listed in COPY_FIELDS_PLURALIZE.
        def copy_page_metadata
          metadata = page.metadata
          COPY_FIELDS.each do |fieldname|
            value = metadata.send(fieldname.to_s)
            pluralize = COPY_FIELDS_PLURALIZE.include?(fieldname)
            @target.send("#{fieldname}=", pluralize ? [value] : value)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,112 @@
1
+ require 'nokogiri'
2
+
3
module NewspaperWorks
  module Ingest
    module NDNP
      # Read-only accessors for page-level metadata, drawn from the page's
      # dmdSec in an NDNP METS document and from the page's ALTO OCR file.
      class PageMetadata
        # mixin convenience methods for NDNP XML, plus XML_NS hash
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :dmdid, :doc

        # @param path [String, NilClass] path to XML describing the page
        # @param parent [Object, NilClass] optional owner; when given, its
        #   already-parsed `doc` is reused instead of re-reading `path`
        # @param dmdid [String, NilClass] DMDID of the page within the doc
        # @raise [ArgumentError] if neither path nor parent is given
        def initialize(path = nil, parent = nil, dmdid = nil)
          raise ArgumentError, 'No context provided' if path.nil? && parent.nil?
          @path = path
          @parent = parent
          @dmdid = dmdid
          @doc = nil
          load_doc
        end

        # Values are passed as format arguments (not interpolated into the
        # format string) so a literal '%' in them cannot break format.
        def inspect
          format(
            "<%<cls>s:0x000000000%<oid>x\n" \
            "\tpath: '%<path>s',\n" \
            "\tdmdid: '%<dmdid>s' ...>",
            cls: self.class,
            oid: object_id << 1,
            path: path,
            dmdid: dmdid
          )
        end

        # Printed page number, if printed; optional field in NDNP spec.
        #   "Number" is used liberally, and may contain both alpha
        #   and numeric characters.  As such, return value is String.
        #
        #   If NDNP issue data fails to provide an explicitly
        #   human-readable page number, fallback to sequence
        #   number, in String form.
        #
        # @return [String, NilClass] Page "number" string
        def page_number
          detail = dmd_node.xpath(
            ".//mods:mods//mods:detail[@type='page number']",
            **XML_NS
          )
          # an empty node set yields nil here, as does a detail element
          # lacking a mods:number child:
          number = detail.xpath("mods:number", **XML_NS).first
          return number.text unless number.nil?
          fallback = page_sequence_number
          fallback.nil? ? nil : fallback.to_s
        end

        # Page sequence number, indexical to order in issue.
        #   "Number" here is one-indexed positive integer, position in
        #   issue.  Mandatory for page of issue, nil for page of reel.
        # @return [Integer,NilClass] Page sequence number, positive integer
        def page_sequence_number
          detail = dmd_node.xpath(
            ".//mods:mods//mods:extent[@unit='pages']",
            **XML_NS
          )
          node = detail.xpath("mods:start", **XML_NS).first
          node.text.to_i unless node.nil?
        end

        # Extract identifier from page ALTO, based on file name.
        #   XML parsing of big documents is expensive, so use regex to
        #   scan for fileName element, and return its value.
        # @return [String,NilClass] file name stripped of directory and
        #   extension, or nil if ALTO has no fileName element.
        def identifier
          # String#[] with a capture index returns nil when nothing
          # matches, rather than raising on a nil match:
          file_name = page_alto[%r{<fileName>([^<]*)</fileName>}, 1]
          file_name.nil? ? nil : stripped_filename(file_name)
        end

        # @return [Integer] HEIGHT attribute of ALTO Page element, as
        #   integer (0 when unavailable, via nil.to_i)
        def height
          alto_page_meta('HEIGHT').to_i
        end

        # @return [Integer] WIDTH attribute of ALTO Page element, as
        #   integer (0 when unavailable, via nil.to_i)
        def width
          alto_page_meta('WIDTH').to_i
        end

        private

        # filename stripped of base path and file extension
        def stripped_filename(path)
          File.basename(path).split('.')[0]
        end

        # Reuse the parent's parsed document when available; otherwise
        # parse the file at `path`. Block form of File.open closes the
        # handle once Nokogiri has read the document.
        def load_doc
          @doc = @parent.doc unless @parent.nil?
          @doc = File.open(path) { |io| Nokogiri::XML(io) } if @doc.nil?
        end

        # Path to the page's ALTO file (the file with use "ocr").
        def alto_path
          specified_path = page_files['ocr']
          normalize_path(specified_path)
        end

        # Raw ALTO file content as a String.
        def page_alto
          File.read(alto_path)
        end

        # Attribute value from the first <Page ...> start tag in the ALTO
        # file, located by regex rather than by parsing the whole file.
        # @param key [String] attribute name, e.g. 'HEIGHT'
        # @return [String,NilClass] attribute value, nil if no Page tag
        #   is found or the attribute is absent
        def alto_page_meta(key)
          start_tag = page_alto[/<Page [^>]*>/]
          return if start_tag.nil?
          # parse xml <Page> start tag fragment, get attributes:
          page_tag = Nokogiri::XML(start_tag).root
          page_tag.nil? ? nil : page_tag[key]
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'newspaper_works/ingest/ndnp/ndnp_mets_helper'
2
+ require 'newspaper_works/ingest/ndnp/ndnp_asset_helper'
3
+ require 'newspaper_works/ingest/ndnp/page_ingest'
4
+ require 'newspaper_works/ingest/ndnp/page_ingester'
5
+ require 'newspaper_works/ingest/ndnp/page_metadata'
6
+ require 'newspaper_works/ingest/ndnp/issue_ingest'
7
+ require 'newspaper_works/ingest/ndnp/issue_ingester'
8
+ require 'newspaper_works/ingest/ndnp/issue_metadata'
9
+ require 'newspaper_works/ingest/ndnp/container_ingest'
10
+ require 'newspaper_works/ingest/ndnp/container_ingester'
11
+ require 'newspaper_works/ingest/ndnp/container_metadata'
12
+ require 'newspaper_works/ingest/ndnp/batch_xml_ingest'
13
+ require 'newspaper_works/ingest/ndnp/batch_ingester'
14
+
15
module NewspaperWorks
  module Ingest
    # Namespace for NDNP-specific ingest components; the components
    # themselves are loaded by this file's require statements.
    module NDNP; end
  end
end
@@ -0,0 +1,56 @@
1
module NewspaperWorks
  module Ingest
    # Ingest for a NewspaperIssue from a PDF: handles the PDF on the issue
    # itself (via BaseIngest), then creates one child NewspaperPage work
    # per page image split out of the PDF.
    class NewspaperIssueIngest < BaseIngest
      @configured = false

      class << self
        # One-time setup, skipped once @configured is true.
        # PDF ingest may save page images to /tmp (via Dir.tmpdir), which
        # needs whitelisting for use by NewspaperWorks::Data::WorkFiles.commit!
        # via Hyrax CreateWithRemoteFilesActor.
        def configure
          return if @configured == true
          allowed_dirs = Hyrax.config.whitelisted_ingest_dirs
          tmp = Dir.tmpdir
          allowed_dirs.push(tmp) unless allowed_dirs.include?(tmp)
          @configured = true
        end
      end

      # Import the PDF on the issue first, then create child page works
      # from the split pages.
      def import
        super
        create_child_pages
      end

      # Creates child pages with attached TIFF masters, can be called by
      #   `import`, or independently if `load` is called first.  The
      #   latter is appropriate if framework is already handling the
      #   NewspaperIssue file attachment (e.g. Hyrax upload via browser).
      def create_child_pages
        self.class.configure
        tiff_paths = NewspaperWorks::Ingest::PdfPages.new(path).to_a
        tiff_paths.each_with_index do |tiffpath, idx|
          child = new_child_page_with_file(tiffpath, idx)
          @work.ordered_members << child
        end
        # only re-save the issue when members were actually added:
        @work.save!(validate: false) unless tiff_paths.empty?
      end

      # Build, populate, and save one child NewspaperPage.
      # @param tiffpath [String] path to the page image to ingest
      # @param idx [Integer] zero-based position of the page
      # @return [NewspaperPage] the saved child page
      def new_child_page_with_file(tiffpath, idx)
        sequence = idx + 1
        NewspaperPage.new.tap do |child|
          child.title = ["#{@work.title.first}: Page #{sequence}"]
          # technically, a sequence number distinct from displayed page number
          child.page_number = sequence.to_s
          # Set depositor and admin-set id:
          child.depositor = @work.depositor
          child.admin_set_id = @work.admin_set_id
          # copying permissions also by effect copies visibility:
          child.permissions_attributes = @work.permissions.map(&:to_hash)
          NewspaperPageIngest.new(child).ingest(tiffpath)
          child.save!
        end
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
module NewspaperWorks
  module Ingest
    # Ingest for a NewspaperPage work; defines no behavior of its own
    # beyond what it inherits from BaseIngest.
    class NewspaperPageIngest < BaseIngest; end
  end
end