newspaper_works 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (461) hide show
  1. checksums.yaml +7 -0
  2. data/.fcrepo_wrapper +4 -0
  3. data/.gitignore +43 -0
  4. data/.rubocop.yml +143 -0
  5. data/.solr_wrapper +8 -0
  6. data/.travis.yml +50 -0
  7. data/Gemfile +47 -0
  8. data/LICENSE +203 -0
  9. data/README.md +159 -0
  10. data/Rakefile +38 -0
  11. data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
  12. data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
  13. data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
  14. data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
  15. data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
  16. data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
  17. data/app/assets/config/newspaper_works_manifest.js +2 -0
  18. data/app/assets/images/newspaper_works/.keep +0 -0
  19. data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
  20. data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
  21. data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
  22. data/app/assets/javascripts/newspaper_works.js +4 -0
  23. data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
  24. data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
  25. data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
  26. data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
  27. data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
  28. data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
  29. data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
  30. data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
  31. data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
  32. data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
  33. data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
  34. data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
  35. data/app/forms/hyrax/newspaper_article_form.rb +11 -0
  36. data/app/forms/hyrax/newspaper_container_form.rb +11 -0
  37. data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
  38. data/app/forms/hyrax/newspaper_page_form.rb +15 -0
  39. data/app/forms/hyrax/newspaper_title_form.rb +12 -0
  40. data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
  41. data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
  42. data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
  43. data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
  44. data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
  45. data/app/helpers/newspaper_works/application_helper.rb +5 -0
  46. data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
  47. data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
  48. data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
  49. data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
  50. data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
  51. data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
  52. data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
  53. data/app/indexers/newspaper_article_indexer.rb +16 -0
  54. data/app/indexers/newspaper_container_indexer.rb +18 -0
  55. data/app/indexers/newspaper_issue_indexer.rb +26 -0
  56. data/app/indexers/newspaper_page_indexer.rb +9 -0
  57. data/app/indexers/newspaper_title_indexer.rb +19 -0
  58. data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
  59. data/app/jobs/newspaper_works/application_job.rb +4 -0
  60. data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
  61. data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
  62. data/app/mailers/newspaper_works/application_mailer.rb +8 -0
  63. data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
  64. data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
  65. data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
  66. data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
  67. data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
  68. data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
  69. data/app/models/file_set.rb +10 -0
  70. data/app/models/newspaper_article.rb +158 -0
  71. data/app/models/newspaper_container.rb +86 -0
  72. data/app/models/newspaper_issue.rb +115 -0
  73. data/app/models/newspaper_page.rb +70 -0
  74. data/app/models/newspaper_title.rb +111 -0
  75. data/app/models/newspaper_works/application_record.rb +6 -0
  76. data/app/models/newspaper_works/derivative_attachment.rb +8 -0
  77. data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
  78. data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
  79. data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
  80. data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
  81. data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
  82. data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
  83. data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
  84. data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
  85. data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
  86. data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
  87. data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
  88. data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
  89. data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
  90. data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
  91. data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
  92. data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
  93. data/app/services/hyrax/article_genre_service.rb +9 -0
  94. data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
  95. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
  96. data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
  97. data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
  98. data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
  99. data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
  100. data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
  101. data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
  102. data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
  103. data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
  104. data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
  105. data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
  106. data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
  107. data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
  108. data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
  109. data/app/views/catalog/_snippets_more.html.erb +16 -0
  110. data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
  111. data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
  112. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  113. data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
  114. data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
  115. data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
  116. data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
  117. data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
  118. data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
  119. data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
  120. data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
  121. data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
  122. data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
  123. data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
  124. data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
  125. data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
  126. data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
  127. data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
  128. data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
  129. data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
  130. data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
  131. data/app/views/newspaper_works/base/_show.html.erb +45 -0
  132. data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
  133. data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
  134. data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
  135. data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
  136. data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
  137. data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
  138. data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
  139. data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
  140. data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
  141. data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
  142. data/app/views/records/edit_fields/_genre.html.erb +4 -0
  143. data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
  144. data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
  145. data/bin/rails +13 -0
  146. data/config/fcrepo_wrapper_test.yml +5 -0
  147. data/config/initializers/assets.rb +2 -0
  148. data/config/locales/newspaper_article.de.yml +12 -0
  149. data/config/locales/newspaper_article.en.yml +12 -0
  150. data/config/locales/newspaper_article.es.yml +12 -0
  151. data/config/locales/newspaper_article.fr.yml +12 -0
  152. data/config/locales/newspaper_article.it.yml +12 -0
  153. data/config/locales/newspaper_article.pt-BR.yml +12 -0
  154. data/config/locales/newspaper_article.zh.yml +12 -0
  155. data/config/locales/newspaper_container.de.yml +8 -0
  156. data/config/locales/newspaper_container.en.yml +8 -0
  157. data/config/locales/newspaper_container.es.yml +8 -0
  158. data/config/locales/newspaper_container.fr.yml +8 -0
  159. data/config/locales/newspaper_container.it.yml +8 -0
  160. data/config/locales/newspaper_container.pt-BR.yml +8 -0
  161. data/config/locales/newspaper_container.zh.yml +8 -0
  162. data/config/locales/newspaper_issue.de.yml +8 -0
  163. data/config/locales/newspaper_issue.en.yml +8 -0
  164. data/config/locales/newspaper_issue.es.yml +8 -0
  165. data/config/locales/newspaper_issue.fr.yml +8 -0
  166. data/config/locales/newspaper_issue.it.yml +8 -0
  167. data/config/locales/newspaper_issue.pt-BR.yml +8 -0
  168. data/config/locales/newspaper_issue.zh.yml +8 -0
  169. data/config/locales/newspaper_page.de.yml +15 -0
  170. data/config/locales/newspaper_page.en.yml +15 -0
  171. data/config/locales/newspaper_page.es.yml +15 -0
  172. data/config/locales/newspaper_page.fr.yml +15 -0
  173. data/config/locales/newspaper_page.it.yml +15 -0
  174. data/config/locales/newspaper_page.pt-BR.yml +15 -0
  175. data/config/locales/newspaper_page.zh.yml +15 -0
  176. data/config/locales/newspaper_title.de.yml +8 -0
  177. data/config/locales/newspaper_title.en.yml +8 -0
  178. data/config/locales/newspaper_title.es.yml +8 -0
  179. data/config/locales/newspaper_title.fr.yml +8 -0
  180. data/config/locales/newspaper_title.it.yml +8 -0
  181. data/config/locales/newspaper_title.pt-BR.yml +8 -0
  182. data/config/locales/newspaper_title.zh.yml +8 -0
  183. data/config/locales/newspaper_works.de.yml +50 -0
  184. data/config/locales/newspaper_works.en.yml +52 -0
  185. data/config/locales/newspaper_works.es.yml +52 -0
  186. data/config/locales/newspaper_works.fr.yml +52 -0
  187. data/config/locales/newspaper_works.it.yml +52 -0
  188. data/config/locales/newspaper_works.pt-BR.yml +52 -0
  189. data/config/locales/newspaper_works.zh.yml +52 -0
  190. data/config/routes.rb +9 -0
  191. data/config/solr_wrapper_test.yml +9 -0
  192. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  193. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  194. data/config/test-fixture/solr-config/elevate.xml +36 -0
  195. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  196. data/config/test-fixture/solr-config/protwords.txt +21 -0
  197. data/config/test-fixture/solr-config/schema.xml +366 -0
  198. data/config/test-fixture/solr-config/scripts.conf +24 -0
  199. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  200. data/config/test-fixture/solr-config/spellings.txt +2 -0
  201. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  202. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  203. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  204. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  205. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  206. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  207. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  208. data/config/vendor/imagemagick-6-policy.xml +76 -0
  209. data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
  210. data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
  211. data/lib/generators/newspaper_works/assets_generator.rb +29 -0
  212. data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
  213. data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
  214. data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
  215. data/lib/generators/newspaper_works/install_generator.rb +97 -0
  216. data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
  217. data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
  218. data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
  219. data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
  220. data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
  221. data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
  222. data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
  223. data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
  224. data/lib/newspaper_works/configuration.rb +14 -0
  225. data/lib/newspaper_works/data/fileset_helper.rb +25 -0
  226. data/lib/newspaper_works/data/path_helper.rb +40 -0
  227. data/lib/newspaper_works/data/work_derivatives.rb +314 -0
  228. data/lib/newspaper_works/data/work_file.rb +92 -0
  229. data/lib/newspaper_works/data/work_files.rb +181 -0
  230. data/lib/newspaper_works/data.rb +35 -0
  231. data/lib/newspaper_works/engine.rb +42 -0
  232. data/lib/newspaper_works/errors.rb +14 -0
  233. data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
  234. data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
  235. data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
  236. data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
  237. data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
  238. data/lib/newspaper_works/ingest/from_command.rb +52 -0
  239. data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
  240. data/lib/newspaper_works/ingest/issue_images.rb +51 -0
  241. data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
  242. data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
  243. data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
  244. data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
  245. data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
  246. data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
  247. data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
  248. data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
  249. data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
  250. data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
  251. data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
  252. data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
  253. data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
  254. data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
  255. data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
  256. data/lib/newspaper_works/ingest/ndnp.rb +21 -0
  257. data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
  258. data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
  259. data/lib/newspaper_works/ingest/page_image.rb +52 -0
  260. data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
  261. data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
  262. data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
  263. data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
  264. data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
  265. data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
  266. data/lib/newspaper_works/ingest/publication_info.rb +44 -0
  267. data/lib/newspaper_works/ingest.rb +90 -0
  268. data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
  269. data/lib/newspaper_works/logging.rb +54 -0
  270. data/lib/newspaper_works/page_finder.rb +62 -0
  271. data/lib/newspaper_works/resource_fetcher.rb +78 -0
  272. data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
  273. data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
  274. data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
  275. data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
  276. data/lib/newspaper_works/text_extraction.rb +10 -0
  277. data/lib/newspaper_works/version.rb +3 -0
  278. data/lib/newspaper_works.rb +19 -0
  279. data/lib/tasks/newspaper_works_tasks.rake +39 -0
  280. data/newspaper_works.gemspec +49 -0
  281. data/spec/.keep.txt +1 -0
  282. data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
  283. data/spec/controllers/catalog_controller_spec.rb +63 -0
  284. data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
  285. data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
  286. data/spec/factories/ability.rb +6 -0
  287. data/spec/factories/newspaper_issue.rb +7 -0
  288. data/spec/factories/newspaper_issue_ingest.rb +6 -0
  289. data/spec/factories/newspaper_page.rb +7 -0
  290. data/spec/factories/newspaper_page_ingest.rb +6 -0
  291. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  292. data/spec/factories/newspaper_title.rb +8 -0
  293. data/spec/factories/uploaded_pdf_file.rb +9 -0
  294. data/spec/factories/user.rb +13 -0
  295. data/spec/features/front_pages_for_title_spec.rb +19 -0
  296. data/spec/features/newspaper_title_search_spec.rb +30 -0
  297. data/spec/features/newspapers_search_spec.rb +49 -0
  298. data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
  299. data/spec/features_shared.rb +71 -0
  300. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  301. data/spec/fixtures/files/4.1.07.tiff +0 -0
  302. data/spec/fixtures/files/README.md +7 -0
  303. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  304. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  305. data/spec/fixtures/files/credits.md +16 -0
  306. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  307. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  308. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  309. data/spec/fixtures/files/minimal-alto.xml +31 -0
  310. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  311. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  312. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  313. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  314. data/spec/fixtures/files/ocr_alto.xml +202 -0
  315. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  316. data/spec/fixtures/files/ocr_color.tiff +0 -0
  317. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  318. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  319. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  320. data/spec/fixtures/files/page1.tiff +0 -0
  321. data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
  322. data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
  323. data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
  324. data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
  325. data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
  326. data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
  327. data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
  328. data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
  329. data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
  330. data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
  331. data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
  332. data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
  333. data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
  334. data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
  335. data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
  336. data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
  337. data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
  338. data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
  339. data/spec/fixtures/files/resource_mocks/urls.json +82 -0
  340. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  341. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  342. data/spec/fixtures/files/thumbnail.jpg +0 -0
  343. data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
  344. data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
  345. data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
  346. data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
  347. data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
  348. data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
  349. data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
  350. data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
  351. data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
  352. data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
  353. data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
  354. data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
  355. data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
  356. data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
  357. data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
  358. data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
  359. data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
  360. data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
  361. data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
  362. data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
  363. data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
  364. data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
  365. data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
  366. data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
  367. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
  368. data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
  369. data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
  370. data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
  371. data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
  372. data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
  373. data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
  374. data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
  375. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
  376. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
  377. data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
  378. data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
  379. data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
  380. data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
  381. data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
  382. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
  383. data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
  384. data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
  385. data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
  386. data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
  387. data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
  388. data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
  389. data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
  390. data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
  391. data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
  392. data/spec/lib/newspaper_works/logging_spec.rb +53 -0
  393. data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
  394. data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
  395. data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
  396. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
  397. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
  398. data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
  399. data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
  400. data/spec/misc_shared.rb +109 -0
  401. data/spec/model_shared.rb +134 -0
  402. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
  403. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
  404. data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
  405. data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
  406. data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
  407. data/spec/models/newspaper_article_spec.rb +73 -0
  408. data/spec/models/newspaper_container_spec.rb +111 -0
  409. data/spec/models/newspaper_issue_spec.rb +91 -0
  410. data/spec/models/newspaper_page_spec.rb +44 -0
  411. data/spec/models/newspaper_title_spec.rb +116 -0
  412. data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
  413. data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
  414. data/spec/models/solr_document_spec.rb +14 -0
  415. data/spec/ndnp_shared.rb +48 -0
  416. data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
  417. data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
  418. data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
  419. data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
  420. data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
  421. data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
  422. data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
  423. data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
  424. data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
  425. data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
  426. data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
  427. data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
  428. data/spec/routing/route_spec.rb +52 -0
  429. data/spec/search_builders/custom_search_builder_spec.rb +34 -0
  430. data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
  431. data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
  432. data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
  433. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
  434. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
  435. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
  436. data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
  437. data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
  438. data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
  439. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
  440. data/spec/spec_helper.rb +261 -0
  441. data/spec/support/controller_level_helpers.rb +28 -0
  442. data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
  443. data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
  444. data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
  445. data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
  446. data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
  447. data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
  448. data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
  449. data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
  450. data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
  451. data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
  452. data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
  453. data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
  454. data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
  455. data/tasks/newspaperworks_dev.rake +26 -0
  456. data/test/integration/navigation_test.rb +7 -0
  457. data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
  458. data/test/newspaper_works_test.rb +7 -0
  459. data/test/test_helper.rb +17 -0
  460. data/tmp/.keep +0 -0
  461. metadata +1037 -0
@@ -0,0 +1,144 @@
1
+ require 'faraday'
2
+ require 'nokogiri'
3
+ require 'uri'
4
+
5
module NewspaperWorks
  module Ingest
    # Publication metadata for a newspaper title, read from the MODS record
    # served at `BASE_URL/<lccn>/mods`.
    #
    # The record is fetched and parsed during construction. When the
    # response cannot be parsed into a document with a non-empty root, the
    # object is `empty?` and the metadata readers (`title`, `language`,
    # `issn`, `oclcnum`, `preceded_by`, `succeeded_by`) return nil.
    class LCPublicationInfo < BasePublicationInfo
      attr_accessor :place_of_publication, :full_title, :lccn, :place_name, :doc

      # Both prefix spellings map to the MODS v3 namespace, so XPath
      # expressions may use either `mods:` or `MODS:`.
      XML_NS = {
        mods: 'http://www.loc.gov/mods/v3',
        MODS: 'http://www.loc.gov/mods/v3'
      }.freeze

      BASE_URL = 'https://lccn.loc.gov'.freeze

      # @param lccn [String] Library of Congress Control Number; passed to
      #   the superclass constructor (presumably stored as `@lccn` there —
      #   confirm against BasePublicationInfo).
      def initialize(lccn)
        super(lccn)
        @doc = nil
        @full_title = nil
        @place_of_publication = nil
        @place_name = nil
        load
      end

      # @return [String] short description with class, object id and LCCN
      def inspect
        format(
          "<#{self.class}:0x000000000%<oid>x " \
          "\tlccn: '#{@lccn}'>",
          oid: object_id << 1
        )
      end

      # @return [String] URL of the MODS record for this LCCN
      def url
        "#{BASE_URL}/#{@lccn}/mods"
      end

      # Fetch and parse the MODS record, then set `@full_title` from it.
      # Leaves `@full_title` nil when the record is empty or has no
      # matching title element.
      def load_lc
        resp = NewspaperWorks::ResourceFetcher.get(url)
        @doc = Nokogiri.XML(resp['body'])
        return if empty?
        # try title[@type="uniform"] first:
        title = find('//mods:titleInfo[@type="uniform"]/mods:title').first
        # if no type="uniform" title, try non-alternate bare titleInfo:
        #   -- in either case, should omit any non-sorted article (e.g. "The")
        title = find('//mods:titleInfo[count(@type)=0]/mods:title').first if title.nil?
        @full_title = title.text unless title.nil?
      end

      # Place name as expressed by the MODS record.
      # @return [String, nil] "City, State" (or "City, Country") when the
      #   geographic hierarchy provides both parts; otherwise the text of
      #   the originInfo placeTerm; nil when neither is present.
      def mods_place_name
        # prefer geographic subject hierarchy for place name construction:
        city = find('//mods:hierarchicalGeographic/mods:city').first
        # State (e.g. "Utah"), Province (e.g. "Ontario"), other (e.g. "England")
        state = find('//mods:hierarchicalGeographic/mods:state').first
        # if state is nil, fallback to country in its place
        state = find('//mods:hierarchicalGeographic/mods:country').first if state.nil?
        return "#{city.text}, #{state.text}" if city && state
        # fallback to placeTerm text, which may be abbreviated in such a
        #   way that geonames struggles to find on search; for a list of
        #   abbreviations, see:
        #   https://www.loc.gov/aba/publications/FreeSHM/H0810.pdf
        name = find('//mods:originInfo//mods:placeTerm[@type="text"]').first
        name.nil? ? nil : name.text
      end

      # Resolve `@place_name` (from MODS, else from the title) and look up
      # its geonames URI, stored in `@place_of_publication`.
      def load_place
        @place_name = mods_place_name || place_name_from_title(@full_title)
        return if @place_name.nil?
        uri = NewspaperWorks::Ingest.geonames_place_uri(@place_name)
        @place_of_publication = uri
      end

      # @return [Boolean] true when there is no parsed document, the
      #   document has no root element (e.g. a blank or unparseable
      #   response body), or the root has no children.
      def empty?
        return true if @doc.nil?
        root = @doc.root
        # a document parsed from blank/invalid input has a nil root
        root.nil? || root.children.empty?
      end

      # Load the record, then the place data; place lookup is skipped when
      # no title could be read from the record.
      def load
        load_lc
        load_place unless @full_title.nil?
      end

      # @return [String, nil] normalized title, excluding any trailing
      #   parenthetical qualifier; nil when the record is empty or
      #   provided no title.
      def title
        return if empty? || @full_title.nil?
        NewspaperWorks::Ingest.normalize_title(@full_title.split(/ [\(]/)[0])
      end

      # ISO-639-2 three character language code, default is 'eng' (English)
      # @param default [String] value used when the record names no language
      # @return [String, nil] nil when the record is empty
      def language(default = 'eng')
        return if empty?
        v = find('//mods:language/mods:languageTerm').first
        v.nil? ? default : v.text
      end

      # @return [String, nil] ISSN identifier text, if the record has one
      def issn
        return if empty?
        v = find('//mods:mods/mods:identifier[@type="issn"]').first
        v.nil? ? nil : v.text
      end

      # @return [String, nil] OCLC identifier, passed through
      #   `oclc_prefixed` (defined outside this class), if the record has one
      def oclcnum
        return if empty?
        v = find('//mods:mods/mods:identifier[@type="oclc"]').first
        v.nil? ? nil : oclc_prefixed(v.text)
      end

      # @return [String, nil] URL or literal title of the preceding title
      def preceded_by
        related_by('preceding')
      end

      # @return [String, nil] URL or literal title of the succeeding title
      def succeeded_by
        related_by('succeeding')
      end

      private

      # First related item of the given type, as a `BASE_URL` URL when it
      # carries an LCCN, else as its literal title text.
      # @param key [String] relatedItem @type value
      # @return [String, nil]
      def related_by(key)
        return if empty?
        v = find("//mods:relatedItem[@type='#{key}']")
        return nil if v.empty?
        lccn = lccn_for(v[0])
        return "#{BASE_URL}/#{lccn}" unless lccn.nil?
        # No LCCN, ergo no URL, but a related item with a literal title?
        titles = find('mods:title', v[0])
        titles.empty? ? nil : titles[0].text
      end

      # LCCN from a related item's local identifier of the form
      # "(DLC)sn 83045396".
      # @return [String, nil] LCCN with all blanks removed, or nil when the
      #   item has no "(DLC)"-prefixed local identifier
      def lccn_for(related_item)
        identifiers = find('mods:identifier[@type="local"]', related_item)
        selected = identifiers.select { |v| v.text.start_with?('(DLC)') }
        return if selected.empty?
        # remove every blank, not just the first: the prefix and the
        # number may be separated by more than one space.
        selected.first.text.split(')')[-1].delete(' ')
      end

      # Evaluate an XPath expression with the MODS namespace prefixes bound.
      # @param expr [String] XPath expression
      # @param context [#xpath, nil] node to search from; defaults to the
      #   whole document
      # @return [Enumerable] matching nodes; an empty array when there is
      #   no document to search, so callers can safely use `.first`
      def find(expr, context = nil)
        context ||= @doc
        return [] if context.nil?
        context.xpath(
          expr,
          **XML_NS
        )
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ require 'date'
2
+
3
module NewspaperWorks
  module Ingest
    # Mixin for deducing issue metadata from path, publication info.
    # Precondition: the consuming class has accessors for:
    #   - `path`: full path to issue (a file such as a PDF, or a directory)
    #   - `publication`: an object responding to `lccn` and `title`, e.g. a
    #     `NewspaperWorks::Ingest::PublicationInfo` object.
    module NamedIssueMetadata
      # Memoized filename from path:
      # @return [String] last component of `path`
      def filename
        return @filename unless @filename.nil?
        @filename = File.basename(path)
      end

      # Check that `path` exists and that its file name begins with a
      # `YYYYMMDD` date stamp, optionally followed by a two-digit edition.
      # @raise [ArgumentError] when the path is missing or misnamed
      # @return [nil]
      def validate_path
        # expect path to exist:
        raise ArgumentError, "Path does not exist: #{path}" unless File.exist?(path)
        # `YYYYMMDDEE` with valid date digits, optional `EE` edition;
        #   anchored with \A so only the start of the whole name can match.
        ptn = /\A([0-9]{4})(1[012]|0[1-9])(3[01]|[12][0-9]|0[1-9])([0-9]{2})?/
        return if ptn.match(filename)
        raise ArgumentError, "File name lacks YYYYMMDD[EE] prefix: #{filename}"
      end

      # Publication date stamp, read from the first eight characters of
      # the file name.
      # @return [String] ISO 8601 date stamp
      # @raise [ArgumentError] when the digits do not form a valid date
      def publication_date
        year = filename.slice(0, 4).to_i
        month = filename.slice(4, 2).to_i
        day = filename.slice(6, 2).to_i
        Date.new(year, month, day).iso8601
      end

      # Issue edition number
      # @return [Integer] number of issue edition; 1 when the name carries
      #   no edition digits
      def edition_number
        # use file name minus file extension (if applicable, e.g. PDF);
        #   File.basename leaves an extension-less name (e.g. an issue
        #   directory) intact instead of reducing it to an empty string.
        base = File.basename(filename, '.*')
        # default for PDF or issue dir not specifying edition value in
        #   name (before file extension, if applicable):
        return 1 if base.size < 10
        # ...otherwise use explicitly provided edition number in filename
        base.slice(8, 2).to_i
      end

      # rubocop:disable Rails/Delegate
      # @return [String] LCCN of the publication
      def lccn
        publication.lccn
      end
      # rubocop:enable Rails/Delegate

      # Issue title: publication title plus long-form date, with the
      # edition number appended in parentheses for editions after the first.
      # @return [Array<String>] single-element array holding the title
      def title
        title_date = Date.iso8601(publication_date).strftime('%B %-d, %Y')
        edition = edition_number
        v = "#{publication.title}: #{title_date}"
        v = "#{v} (#{edition})" if edition > 1
        [v]
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'date'
2
+ require 'find'
3
+ require 'optparse'
4
+
5
module NewspaperWorks
  module Ingest
    module NDNP
      # Ingests every issue listed by an NDNP batch manifest, handing each
      # one to an IssueIngester together with the global ingest options.
      class BatchIngester
        extend NewspaperWorks::Ingest::FromCommand
        include NewspaperWorks::Logging

        attr_accessor :path, :batch, :opts

        # normalize path, possibly from directory, to contained batch
        #   manifest XML path:
        # @param path [String]
        # @return [String, nil] `path` itself when it is not a directory;
        #   otherwise the first manifest found beneath it, preferring a
        #   name ending in `_1.xml`; nil when the directory holds none.
        def self.normalize_path(path)
          return path unless File.directory?(path)
          candidates = Find.find(path).select do |f|
            f.downcase.end_with?('batch_1.xml', 'batch.xml')
          end
          # match the preference case-insensitively, as the selection is
          preferred = candidates.find { |f| f.downcase.end_with?('_1.xml') }
          preferred || candidates.first
        end

        # @param path [String] path to batch xml or directory
        # @param opts [Hash]
        #   global ingest options, to be passed to ingester components,
        #   may include administrative metadata.
        # @raise [IOError] when no batch manifest can be located
        def initialize(path, opts = {})
          @path = self.class.normalize_path(path)
          # normalize_path yields nil for a directory without a manifest
          raise IOError, "No batch file found: #{path}" if @path.nil? || @path.empty?
          @opts = opts
          @batch = batch_enumerator
          configure_logger('ingest')
        end

        # Ingest each issue of the batch in turn, logging start and finish.
        def ingest
          write_log("Beginning NDNP batch ingest for #{@path}")
          batch.each do |issue|
            issue_ingester(issue).ingest
          end
          write_log(
            "NDNP batch ingest complete!"
          )
        end

        private

        # Return BatchIngest object as enumerable of issues:
        def batch_enumerator
          NewspaperWorks::Ingest::NDNP::BatchXMLIngest.new(path)
        end

        # Ingester for a single issue, sharing this batch's options.
        def issue_ingester(issue)
          NewspaperWorks::Ingest::NDNP::IssueIngester.new(issue, @opts)
        end

        # @param v [String, #to_s] date string (parsed with Date.parse) or
        #   a date-like object
        # @return [String] string form of the date
        def normalize_date(v)
          (v.is_a?(String) ? Date.parse(v) : v).to_s
        end
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
+ require 'nokogiri'
2
+
3
module NewspaperWorks
  module Ingest
    module NDNP
      # Reads an NDNP batch manifest XML file and exposes the reels
      # (containers) and issues it lists. Enumerating the object yields one
      # IssueIngest per listed issue.
      class BatchXMLIngest
        include Enumerable
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :container_paths, :issue_paths, :path

        delegate :size, to: :issue_paths

        # @param path [String] path to the batch manifest XML file
        def initialize(path)
          @path = path
          load_doc
          # NOTE(review): `normalize_path` is not defined in this class;
          #   presumably provided by NDNPMetsHelper — confirm.
          @container_paths = xpath('//ndnp:batch//ndnp:reel').map do |e|
            normalize_path(e.text)
          end
          @issue_paths = xpath('//ndnp:batch//ndnp:issue').map do |e|
            normalize_path(e.text)
          end
        end

        # @return [String] value of the batch element's `name` attribute
        def name
          xpath('//ndnp:batch').first.attributes['name'].value
        end

        # Source object for a path listed in the manifest.
        # @param path [String]
        # @return [IssueIngest, ContainerIngest] an IssueIngest when the
        #   path is one of `issue_paths`, otherwise a ContainerIngest
        def get(path)
          return get_issue(path) if issue_paths.include?(path)
          get_container(path)
        end

        # @return [Array] source objects for every listed issue
        def issues
          issue_paths.map { |path| get(path) }
        end

        # @return [Array] source objects for every listed reel
        def containers
          container_paths.map { |path| get(path) }
        end

        # Yield an IssueIngest for each listed issue path.
        def each
          @issue_paths.each do |path|
            yield get_issue(path)
          end
        end

        private

        def get_issue(path)
          NewspaperWorks::Ingest::NDNP::IssueIngest.new(path)
        end

        def get_container(path)
          NewspaperWorks::Ingest::NDNP::ContainerIngest.new(path)
        end

        # Evaluate an XPath expression against the manifest with the NDNP
        # namespace bound to both `ndnp:` and `NDNP:` prefixes.
        def xpath(expr)
          ns = {
            ndnp: 'http://www.loc.gov/ndnp',
            NDNP: 'http://www.loc.gov/ndnp'
          }
          @doc.xpath(expr, **ns)
        end

        # Parse the manifest; the block form of File.open closes the file
        # handle as soon as parsing is done instead of leaving it to GC.
        def load_doc
          @doc = File.open(path) { |f| Nokogiri::XML(f) }
        end
      end
    end
  end
end
@@ -0,0 +1,99 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Source data for one reel (container), read from the reel's METS XML.
      # Enumerating the object yields an IssueIngest for each issue
      # directory found beside the reel XML file.
      class ContainerIngest
        # Enumerable of IssueIngest objects for issues in pages
        include Enumerable
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :doc, :dmdids, :issue_paths

        # @param path [String] path to the reel XML file
        def initialize(path)
          @path = path
          @doc = nil
          @metadata = nil
          # identifiers of control images, which we make accessible, but are
          #   not the primary focus of enumeration:
          @dmdids = nil
          @issue_paths = []
          load_doc
        end

        # @return [String] short description with class, object id and path
        def inspect
          format(
            "<#{self.class}:0x000000000%<oid>x\n" \
              "\tpath: '#{path}',\n",
            oid: object_id << 1
          )
        end

        # @return [String] reel number, as reported by the reel metadata
        def identifier
          metadata.reel_number
        end

        # Return control image as PageIngest object.
        #   These objects will not have pagination/sequence data, but
        #   will provide an equivalent programmatic interface for file access
        #   of control images, as one would access normal page files.
        # @return [NewspaperWorks::Ingest::NDNP::PageIngest]
        def page_by_dmdid(dmdid)
          NewspaperWorks::Ingest::NDNP::PageIngest.new(@path, dmdid, self)
        end

        # Get IssueIngest object, given path to its XML
        # @return [NewspaperWorks::Ingest::NDNP::IssueIngest]
        def issue_by_path(path)
          NewspaperWorks::Ingest::NDNP::IssueIngest.new(path)
        end

        # Yield an IssueIngest for each discovered issue XML path.
        def each
          @issue_paths.each do |path|
            yield issue_by_path(path)
          end
        end

        # @return [Integer] number of discovered issue XML paths
        def size
          @issue_paths.size
        end

        # Memoized metadata reader that shares this object's parsed document.
        # @return [NewspaperWorks::Ingest::NDNP::ContainerMetadata]
        def metadata
          return @metadata unless @metadata.nil?
          @metadata = NewspaperWorks::Ingest::NDNP::ContainerMetadata.new(
            path,
            self
          )
        end

        private

        # Parse the reel XML (unless already parsed), record the DMDID of
        # each `np:target` div under the `np:reel` div, then discover
        # issue paths.
        def load_doc
          # block form closes the file handle once parsing has finished
          @doc = File.open(path) { |f| Nokogiri::XML(f) } if @doc.nil?
          page_divs = doc.xpath(
            "//mets:structMap/mets:div[@TYPE='np:reel']/" \
            "mets:div[@TYPE='np:target']",
            mets: 'http://www.loc.gov/METS/'
          )
          # identifiers for reel control images:
          @dmdids = page_divs.map { |div| div.attr('DMDID') }
          load_issue_paths
        end

        # Load instance attribute for issue paths,
        #   based on listing of directory in which reel XML is present.
        #   This is done without context of batch xml,
        #   with file name expectations based on convention,
        #   as expressed in NDNP technical guidelines,
        #   which presume that the issue XML file name will (sans extension)
        #   match directory name for the issue, in date+edition syntax.
        def load_issue_paths
          # sort so that enumeration order does not depend on the order in
          #   which the platform happens to list directory entries.
          sibling_dirs = Dir["#{File.dirname(path)}/*/"].sort
          issue_dirs = sibling_dirs.select do |dir|
            # only all-digit directory names are treated as issues
            !File.basename(dir).match(/^[0-9]+$/).nil?
          end
          @issue_paths = issue_dirs.map do |dir|
            File.join(dir, "#{File.basename(dir)}.xml")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Ingester for a reel/container, given reel source data and the
      #   required publication (NewspaperTitle) asset.
      #   Responsible for finding or creating the container and for linking
      #   it to its (parent) publication and (child) pages.
      class ContainerIngester
        include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper

        attr_accessor :source, :target, :publication, :opts

        # Create ingester in context of source reel data, NewspaperTitle
        # @param source [NewspaperWorks::Ingest::NDNP::ContainerIngest]
        # @param publication [NewspaperTitle] Required publication to link to
        # @param opts [Hash]
        #   ingest options, e.g. administrative metadata
        def initialize(source, publication, opts = {})
          @source = source
          @publication = publication
          @opts = opts
          # stays nil until .ingest populates it with a NewspaperContainer
          @target = nil
        end

        # Find or create the container, then attach it to the publication.
        def ingest
          find_or_create_container
          link_publication
        end

        # Link a page to target container
        # @param page [NewspaperPage]
        def link(page)
          @target.ordered_members << page
          # each link is saved immediately (no deferring/bundling for now)
          @target.save!
        end

        # Set `@target` to the existing container for this reel number,
        # creating a new container when none is found.
        def find_or_create_container
          @target = find_container
          return unless @target.nil?
          create_container
        end

        private

        # Metadata reader of the source reel.
        def metadata
          @source.metadata
        end

        # First existing container whose identifier is this reel number.
        def find_container
          reel_number = metadata.reel_number
          NewspaperContainer.where(identifier: reel_number).first
        end

        # Create a container, fill in descriptive and administrative
        # metadata, and save it.
        def create_container
          @target = NewspaperContainer.create
          copy_metadata
          # NOTE(review): not defined in this class; presumably provided
          #   by NDNPAssetHelper — confirm.
          assign_administrative_metadata
          @target.save!
        end

        # Copy identifier, a generated title, and the remaining reel
        # metadata fields from the source onto the target container.
        def copy_metadata
          reel = metadata.reel_number
          @target.identifier = [reel]
          @target.title = ["Microform reel (#{reel})"]
          [:held_by, :publication_date_start, :publication_date_end].each do |field|
            @target.send("#{field}=", metadata.send(field.to_s))
          end
        end

        # Add the container to the publication's members, unless the
        # container already reports a publication.
        def link_publication
          if @target.publication.nil?
            @publication.members << @target
            @publication.save!
          end
        end
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Reads descriptive and technical metadata for a reel (container)
      # from its reel XML document.
      #
      # NOTE(review): `xpath` is not defined in this class; presumably
      #   provided by NDNPMetsHelper with the mets/mods/ndnp prefixes
      #   bound — confirm.
      class ContainerMetadata
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :doc

        # @param path [String] path to the reel XML file
        # @param parent [#doc, nil] optional object whose already-parsed
        #   `doc` is reused instead of parsing `path` again
        def initialize(path, parent = nil)
          @path = path
          @parent = parent
          @doc = nil
          load_doc
        end

        # @return [String] short description with class, object id and path
        def inspect
          format(
            "<#{self.class}:0x000000000%<oid>x\n" \
              "\tpath: '#{path}',\n",
            oid: object_id << 1
          )
        end

        # Reel Number (NDNP-mandatory)
        # @return [String] a serial number string for reel, may correspond
        #   to an issued barcode; falls back to the METS root's LABEL
        #   attribute when no reel-number identifier is present
        def reel_number
          v = xpath("//mods:identifier[@type='reel number']").first
          return v.text unless v.nil?
          xpath('//mets:mets/@LABEL').first.value
        end

        # Original Source Repository (NDNP-mandatory)
        # @return [String] display label of the physical location
        def held_by
          v = xpath("//mods:physicalLocation").first
          return v['displayLabel'] unless v.nil?
          # fallback to look at mods:note/@displayLabel, when the
          #   @type="agencyResponsibleForReproduction"
          xpath(
            '//mods:note[@type="agencyResponsibleForReproduction"]' \
            '/@displayLabel'
          ).first.value
        end

        # Media genre/form (Page Physical Description, e.g. "microform")
        #   NDNP Mandatory.
        # @return [String] value of the form element's `type` attribute
        def genre
          # NOTE(review): mixes `mods:` and `MODS:` prefixes; works only if
          #   the helper binds both — confirm.
          form = xpath('//mods:physicalDescription/MODS:form').first
          form.attributes['type'].value
        end

        # Titles (on Reel) (optional)
        # @return [String, nil] title, nil when the element is absent
        def title
          techmd('ndnp:titles')
        end

        # Start Date (optional)
        # @return [String, nil] ISO 8601 formatted date, nil when absent
        def publication_date_start
          techmd('ndnp:startDate')
        end

        # End Date (optional)
        # @return [String, nil] ISO 8601 formatted date, nil when absent
        def publication_date_end
          techmd('ndnp:endDate')
        end

        private

        # Reuse the parent's parsed document when there is one; otherwise
        # parse the file at `path`.
        def load_doc
          @doc = @parent.doc unless @parent.nil?
          # block form closes the file handle once parsing has finished
          @doc = File.open(path) { |f| Nokogiri::XML(f) } if @doc.nil?
        end

        # Reel technical metadata.
        # @param spec [String, nil] relative XPath of a child element
        # @return [Object, String, nil] the reelTechMD node set when `spec`
        #   is nil; otherwise the text of the first matching element, or
        #   nil when no element matches (these fields are optional)
        def techmd(spec = nil)
          base = xpath('//ndnp:reelTechMD')
          return base if spec.nil?
          node = base.xpath(spec).first
          node.nil? ? nil : node.text
        end
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
module NewspaperWorks
  module Ingest
    module NDNP
      # Source data for one issue, read from the issue's METS XML.
      # Enumerating the object yields a PageIngest for each `np:page` div
      # in the structMap, in document order.
      class IssueIngest
        include Enumerable
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :doc, :dmdids

        # @param path [String] path to the issue XML file
        def initialize(path)
          @path = path
          @doc = nil
          @metadata = nil
          # Enumeration based on list of DMDID loaded by load_doc
          @dmdids = nil
          load_doc
          # cache dmdid -> PageIngest
          @page_cache = {}
        end

        # @return [String] short description with class, object id and path
        def inspect
          format(
            "<#{self.class}:0x000000000%<oid>x\n" \
              "\tpath: '#{path}',\n",
            oid: object_id << 1
          )
        end

        # @return [String] LCCN, as reported by the issue metadata
        def identifier
          metadata.lccn
        end

        # Page source object for a DMDID; created once, then served from
        # the cache.
        # @return [NewspaperWorks::Ingest::NDNP::PageIngest]
        def page_by_dmdid(dmdid)
          return @page_cache[dmdid] if @page_cache.key?(dmdid)
          p = NewspaperWorks::Ingest::NDNP::PageIngest.new(@path, dmdid, self)
          @page_cache[dmdid] = p
          p
        end

        # Page source object for the page whose MODS extent start equals
        # `n`, located through the ID of the enclosing dmdSec.
        # NOTE(review): raises NoMethodError (nil receiver) when no page
        #   has that sequence number — confirm callers never pass one.
        # @param n [Integer, String] page sequence number
        # @return [NewspaperWorks::Ingest::NDNP::PageIngest]
        def page_by_sequence_number(n)
          page_by_dmdid(
            doc.xpath(
              "//mods:extent//mods:start[text()='#{n}']",
              mets: 'http://www.loc.gov/METS/',
              mods: 'http://www.loc.gov/mods/v3'
            ).first.ancestors('dmdSec').first['ID']
          )
        end

        # Yield a PageIngest for each page DMDID.
        def each
          @dmdids.each do |dmdid|
            yield page_by_dmdid(dmdid)
          end
        end

        # @return [Integer] number of pages in the issue
        def size
          @dmdids.size
        end

        # Memoized metadata reader that shares this object's parsed document.
        # @return [NewspaperWorks::Ingest::NDNP::IssueMetadata]
        def metadata
          return @metadata unless @metadata.nil?
          @metadata = NewspaperWorks::Ingest::NDNP::IssueMetadata.new(
            path,
            self
          )
        end

        private

        # Parse the issue XML (unless already parsed) and record the DMDID
        # of every `np:page` div in the structMap.
        def load_doc
          # block form closes the file handle once parsing has finished
          @doc = File.open(path) { |f| Nokogiri::XML(f) } if @doc.nil?
          page_divs = doc.xpath(
            "//mets:structMap//mets:div[@TYPE='np:page']",
            mets: 'http://www.loc.gov/METS/'
          )
          @dmdids = page_divs.map { |div| div.attr('DMDID') }
        end
      end
    end
  end
end