newspaper_works 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (461) hide show
  1. checksums.yaml +7 -0
  2. data/.fcrepo_wrapper +4 -0
  3. data/.gitignore +43 -0
  4. data/.rubocop.yml +143 -0
  5. data/.solr_wrapper +8 -0
  6. data/.travis.yml +50 -0
  7. data/Gemfile +47 -0
  8. data/LICENSE +203 -0
  9. data/README.md +159 -0
  10. data/Rakefile +38 -0
  11. data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
  12. data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
  13. data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
  14. data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
  15. data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
  16. data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
  17. data/app/assets/config/newspaper_works_manifest.js +2 -0
  18. data/app/assets/images/newspaper_works/.keep +0 -0
  19. data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
  20. data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
  21. data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
  22. data/app/assets/javascripts/newspaper_works.js +4 -0
  23. data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
  24. data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
  25. data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
  26. data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
  27. data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
  28. data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
  29. data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
  30. data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
  31. data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
  32. data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
  33. data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
  34. data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
  35. data/app/forms/hyrax/newspaper_article_form.rb +11 -0
  36. data/app/forms/hyrax/newspaper_container_form.rb +11 -0
  37. data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
  38. data/app/forms/hyrax/newspaper_page_form.rb +15 -0
  39. data/app/forms/hyrax/newspaper_title_form.rb +12 -0
  40. data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
  41. data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
  42. data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
  43. data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
  44. data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
  45. data/app/helpers/newspaper_works/application_helper.rb +5 -0
  46. data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
  47. data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
  48. data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
  49. data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
  50. data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
  51. data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
  52. data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
  53. data/app/indexers/newspaper_article_indexer.rb +16 -0
  54. data/app/indexers/newspaper_container_indexer.rb +18 -0
  55. data/app/indexers/newspaper_issue_indexer.rb +26 -0
  56. data/app/indexers/newspaper_page_indexer.rb +9 -0
  57. data/app/indexers/newspaper_title_indexer.rb +19 -0
  58. data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
  59. data/app/jobs/newspaper_works/application_job.rb +4 -0
  60. data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
  61. data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
  62. data/app/mailers/newspaper_works/application_mailer.rb +8 -0
  63. data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
  64. data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
  65. data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
  66. data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
  67. data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
  68. data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
  69. data/app/models/file_set.rb +10 -0
  70. data/app/models/newspaper_article.rb +158 -0
  71. data/app/models/newspaper_container.rb +86 -0
  72. data/app/models/newspaper_issue.rb +115 -0
  73. data/app/models/newspaper_page.rb +70 -0
  74. data/app/models/newspaper_title.rb +111 -0
  75. data/app/models/newspaper_works/application_record.rb +6 -0
  76. data/app/models/newspaper_works/derivative_attachment.rb +8 -0
  77. data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
  78. data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
  79. data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
  80. data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
  81. data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
  82. data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
  83. data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
  84. data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
  85. data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
  86. data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
  87. data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
  88. data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
  89. data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
  90. data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
  91. data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
  92. data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
  93. data/app/services/hyrax/article_genre_service.rb +9 -0
  94. data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
  95. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
  96. data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
  97. data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
  98. data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
  99. data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
  100. data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
  101. data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
  102. data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
  103. data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
  104. data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
  105. data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
  106. data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
  107. data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
  108. data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
  109. data/app/views/catalog/_snippets_more.html.erb +16 -0
  110. data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
  111. data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
  112. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  113. data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
  114. data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
  115. data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
  116. data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
  117. data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
  118. data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
  119. data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
  120. data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
  121. data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
  122. data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
  123. data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
  124. data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
  125. data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
  126. data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
  127. data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
  128. data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
  129. data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
  130. data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
  131. data/app/views/newspaper_works/base/_show.html.erb +45 -0
  132. data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
  133. data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
  134. data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
  135. data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
  136. data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
  137. data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
  138. data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
  139. data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
  140. data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
  141. data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
  142. data/app/views/records/edit_fields/_genre.html.erb +4 -0
  143. data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
  144. data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
  145. data/bin/rails +13 -0
  146. data/config/fcrepo_wrapper_test.yml +5 -0
  147. data/config/initializers/assets.rb +2 -0
  148. data/config/locales/newspaper_article.de.yml +12 -0
  149. data/config/locales/newspaper_article.en.yml +12 -0
  150. data/config/locales/newspaper_article.es.yml +12 -0
  151. data/config/locales/newspaper_article.fr.yml +12 -0
  152. data/config/locales/newspaper_article.it.yml +12 -0
  153. data/config/locales/newspaper_article.pt-BR.yml +12 -0
  154. data/config/locales/newspaper_article.zh.yml +12 -0
  155. data/config/locales/newspaper_container.de.yml +8 -0
  156. data/config/locales/newspaper_container.en.yml +8 -0
  157. data/config/locales/newspaper_container.es.yml +8 -0
  158. data/config/locales/newspaper_container.fr.yml +8 -0
  159. data/config/locales/newspaper_container.it.yml +8 -0
  160. data/config/locales/newspaper_container.pt-BR.yml +8 -0
  161. data/config/locales/newspaper_container.zh.yml +8 -0
  162. data/config/locales/newspaper_issue.de.yml +8 -0
  163. data/config/locales/newspaper_issue.en.yml +8 -0
  164. data/config/locales/newspaper_issue.es.yml +8 -0
  165. data/config/locales/newspaper_issue.fr.yml +8 -0
  166. data/config/locales/newspaper_issue.it.yml +8 -0
  167. data/config/locales/newspaper_issue.pt-BR.yml +8 -0
  168. data/config/locales/newspaper_issue.zh.yml +8 -0
  169. data/config/locales/newspaper_page.de.yml +15 -0
  170. data/config/locales/newspaper_page.en.yml +15 -0
  171. data/config/locales/newspaper_page.es.yml +15 -0
  172. data/config/locales/newspaper_page.fr.yml +15 -0
  173. data/config/locales/newspaper_page.it.yml +15 -0
  174. data/config/locales/newspaper_page.pt-BR.yml +15 -0
  175. data/config/locales/newspaper_page.zh.yml +15 -0
  176. data/config/locales/newspaper_title.de.yml +8 -0
  177. data/config/locales/newspaper_title.en.yml +8 -0
  178. data/config/locales/newspaper_title.es.yml +8 -0
  179. data/config/locales/newspaper_title.fr.yml +8 -0
  180. data/config/locales/newspaper_title.it.yml +8 -0
  181. data/config/locales/newspaper_title.pt-BR.yml +8 -0
  182. data/config/locales/newspaper_title.zh.yml +8 -0
  183. data/config/locales/newspaper_works.de.yml +50 -0
  184. data/config/locales/newspaper_works.en.yml +52 -0
  185. data/config/locales/newspaper_works.es.yml +52 -0
  186. data/config/locales/newspaper_works.fr.yml +52 -0
  187. data/config/locales/newspaper_works.it.yml +52 -0
  188. data/config/locales/newspaper_works.pt-BR.yml +52 -0
  189. data/config/locales/newspaper_works.zh.yml +52 -0
  190. data/config/routes.rb +9 -0
  191. data/config/solr_wrapper_test.yml +9 -0
  192. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  193. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  194. data/config/test-fixture/solr-config/elevate.xml +36 -0
  195. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  196. data/config/test-fixture/solr-config/protwords.txt +21 -0
  197. data/config/test-fixture/solr-config/schema.xml +366 -0
  198. data/config/test-fixture/solr-config/scripts.conf +24 -0
  199. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  200. data/config/test-fixture/solr-config/spellings.txt +2 -0
  201. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  202. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  203. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  204. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  205. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  206. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  207. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  208. data/config/vendor/imagemagick-6-policy.xml +76 -0
  209. data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
  210. data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
  211. data/lib/generators/newspaper_works/assets_generator.rb +29 -0
  212. data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
  213. data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
  214. data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
  215. data/lib/generators/newspaper_works/install_generator.rb +97 -0
  216. data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
  217. data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
  218. data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
  219. data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
  220. data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
  221. data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
  222. data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
  223. data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
  224. data/lib/newspaper_works/configuration.rb +14 -0
  225. data/lib/newspaper_works/data/fileset_helper.rb +25 -0
  226. data/lib/newspaper_works/data/path_helper.rb +40 -0
  227. data/lib/newspaper_works/data/work_derivatives.rb +314 -0
  228. data/lib/newspaper_works/data/work_file.rb +92 -0
  229. data/lib/newspaper_works/data/work_files.rb +181 -0
  230. data/lib/newspaper_works/data.rb +35 -0
  231. data/lib/newspaper_works/engine.rb +42 -0
  232. data/lib/newspaper_works/errors.rb +14 -0
  233. data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
  234. data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
  235. data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
  236. data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
  237. data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
  238. data/lib/newspaper_works/ingest/from_command.rb +52 -0
  239. data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
  240. data/lib/newspaper_works/ingest/issue_images.rb +51 -0
  241. data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
  242. data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
  243. data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
  244. data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
  245. data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
  246. data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
  247. data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
  248. data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
  249. data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
  250. data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
  251. data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
  252. data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
  253. data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
  254. data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
  255. data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
  256. data/lib/newspaper_works/ingest/ndnp.rb +21 -0
  257. data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
  258. data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
  259. data/lib/newspaper_works/ingest/page_image.rb +52 -0
  260. data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
  261. data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
  262. data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
  263. data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
  264. data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
  265. data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
  266. data/lib/newspaper_works/ingest/publication_info.rb +44 -0
  267. data/lib/newspaper_works/ingest.rb +90 -0
  268. data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
  269. data/lib/newspaper_works/logging.rb +54 -0
  270. data/lib/newspaper_works/page_finder.rb +62 -0
  271. data/lib/newspaper_works/resource_fetcher.rb +78 -0
  272. data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
  273. data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
  274. data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
  275. data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
  276. data/lib/newspaper_works/text_extraction.rb +10 -0
  277. data/lib/newspaper_works/version.rb +3 -0
  278. data/lib/newspaper_works.rb +19 -0
  279. data/lib/tasks/newspaper_works_tasks.rake +39 -0
  280. data/newspaper_works.gemspec +49 -0
  281. data/spec/.keep.txt +1 -0
  282. data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
  283. data/spec/controllers/catalog_controller_spec.rb +63 -0
  284. data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
  285. data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
  286. data/spec/factories/ability.rb +6 -0
  287. data/spec/factories/newspaper_issue.rb +7 -0
  288. data/spec/factories/newspaper_issue_ingest.rb +6 -0
  289. data/spec/factories/newspaper_page.rb +7 -0
  290. data/spec/factories/newspaper_page_ingest.rb +6 -0
  291. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  292. data/spec/factories/newspaper_title.rb +8 -0
  293. data/spec/factories/uploaded_pdf_file.rb +9 -0
  294. data/spec/factories/user.rb +13 -0
  295. data/spec/features/front_pages_for_title_spec.rb +19 -0
  296. data/spec/features/newspaper_title_search_spec.rb +30 -0
  297. data/spec/features/newspapers_search_spec.rb +49 -0
  298. data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
  299. data/spec/features_shared.rb +71 -0
  300. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  301. data/spec/fixtures/files/4.1.07.tiff +0 -0
  302. data/spec/fixtures/files/README.md +7 -0
  303. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  304. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  305. data/spec/fixtures/files/credits.md +16 -0
  306. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  307. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  308. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  309. data/spec/fixtures/files/minimal-alto.xml +31 -0
  310. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  311. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  312. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  313. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  314. data/spec/fixtures/files/ocr_alto.xml +202 -0
  315. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  316. data/spec/fixtures/files/ocr_color.tiff +0 -0
  317. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  318. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  319. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  320. data/spec/fixtures/files/page1.tiff +0 -0
  321. data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
  322. data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
  323. data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
  324. data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
  325. data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
  326. data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
  327. data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
  328. data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
  329. data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
  330. data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
  331. data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
  332. data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
  333. data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
  334. data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
  335. data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
  336. data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
  337. data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
  338. data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
  339. data/spec/fixtures/files/resource_mocks/urls.json +82 -0
  340. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  341. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  342. data/spec/fixtures/files/thumbnail.jpg +0 -0
  343. data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
  344. data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
  345. data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
  346. data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
  347. data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
  348. data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
  349. data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
  350. data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
  351. data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
  352. data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
  353. data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
  354. data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
  355. data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
  356. data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
  357. data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
  358. data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
  359. data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
  360. data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
  361. data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
  362. data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
  363. data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
  364. data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
  365. data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
  366. data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
  367. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
  368. data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
  369. data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
  370. data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
  371. data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
  372. data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
  373. data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
  374. data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
  375. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
  376. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
  377. data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
  378. data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
  379. data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
  380. data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
  381. data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
  382. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
  383. data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
  384. data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
  385. data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
  386. data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
  387. data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
  388. data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
  389. data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
  390. data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
  391. data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
  392. data/spec/lib/newspaper_works/logging_spec.rb +53 -0
  393. data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
  394. data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
  395. data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
  396. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
  397. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
  398. data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
  399. data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
  400. data/spec/misc_shared.rb +109 -0
  401. data/spec/model_shared.rb +134 -0
  402. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
  403. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
  404. data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
  405. data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
  406. data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
  407. data/spec/models/newspaper_article_spec.rb +73 -0
  408. data/spec/models/newspaper_container_spec.rb +111 -0
  409. data/spec/models/newspaper_issue_spec.rb +91 -0
  410. data/spec/models/newspaper_page_spec.rb +44 -0
  411. data/spec/models/newspaper_title_spec.rb +116 -0
  412. data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
  413. data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
  414. data/spec/models/solr_document_spec.rb +14 -0
  415. data/spec/ndnp_shared.rb +48 -0
  416. data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
  417. data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
  418. data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
  419. data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
  420. data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
  421. data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
  422. data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
  423. data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
  424. data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
  425. data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
  426. data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
  427. data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
  428. data/spec/routing/route_spec.rb +52 -0
  429. data/spec/search_builders/custom_search_builder_spec.rb +34 -0
  430. data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
  431. data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
  432. data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
  433. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
  434. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
  435. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
  436. data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
  437. data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
  438. data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
  439. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
  440. data/spec/spec_helper.rb +261 -0
  441. data/spec/support/controller_level_helpers.rb +28 -0
  442. data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
  443. data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
  444. data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
  445. data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
  446. data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
  447. data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
  448. data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
  449. data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
  450. data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
  451. data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
  452. data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
  453. data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
  454. data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
  455. data/tasks/newspaperworks_dev.rake +26 -0
  456. data/test/integration/navigation_test.rb +7 -0
  457. data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
  458. data/test/newspaper_works_test.rb +7 -0
  459. data/test/test_helper.rb +17 -0
  460. data/tmp/.keep +0 -0
  461. metadata +1037 -0
@@ -0,0 +1,69 @@
1
+ require 'newspaper_works/data'
2
+
3
module NewspaperWorks
  module Ingest
    # Base class for ingesting a single source file into a work.
    #
    # A source may be a filesystem path (String or Pathname) or an IO-like
    # object. Loading records +io+, +path+ and +filename+ on the instance;
    # importing attaches the loaded path to the adapted work.
    class BaseIngest
      include NewspaperWorks::Data::PathHelper

      attr_accessor :work, :io, :path, :filename

      # @param work [Object] the work that ingested files are attached to
      def initialize(work)
        # adapted context:
        @work = work
      end

      # Load a source given as a filesystem path.
      #
      # @param source [String, Pathname] path to a readable file; may be
      #   relative to Dir.pwd, which does not matter for our use
      # @return [String] the file name identifying the source
      # @raise [ArgumentError] if the file does not exist or is unreadable
      def loadpath(source)
        # quick check the file exists and is readable on filesystem:
        raise ArgumentError, 'File not found or readable' unless
          File.readable?(source)
        @path = source.to_s
        # NOTE(review): the handle is exposed through #io and nothing in
        # this class closes it; callers own its lifetime.
        @io = File.open(@path)
        @filename ||= File.basename(@path)
      end

      # Load a source given as an IO-like object.
      #
      # A file name is needed to describe/identify the content: it is either
      # the one already set (see #load) or is inferred from the IO's path.
      #
      # @param source [#read] IO-like object, optionally responding to +path+
      # @return [String] the file name identifying the source
      # @raise [ArgumentError] if no file name was given and none can be
      #   inferred from the IO (no +path+ method, or a nil path)
      def loadio(source)
        io_path = source.respond_to?(:path) ? source.path : nil
        # An IO whose path is nil (e.g. an unlinked temp file) cannot supply
        # a name either, so treat it like an IO without a path.
        raise ArgumentError, 'Explicit or inferred file name required' unless
          io_path || @filename
        @io = source
        @path = io_path
        @filename ||= File.basename(@path)
      end

      # Load a source, dispatching on whether it is a path or an IO.
      #
      # @param source [String, Pathname, #read] path or IO-like object
      # @param filename [String, nil] optional identifying file name, which
      #   may be distinct from the actual path on disk
      # @return [String] the file name identifying the source
      # @raise [ArgumentError] if source is neither a path nor IO-like
      def load(source, filename: nil)
        # is_a? (rather than exact class equality) so that subclasses of
        # String and Pathname are accepted as paths too.
        ispath = source.is_a?(String) || source.is_a?(Pathname)
        unless ispath || source.respond_to?(:read)
          raise ArgumentError, 'Source is neither path nor IO object'
        end
        @filename = filename
        ispath ? loadpath(source) : loadio(source)
      end

      # Default handler attaches the loaded path to the work's files and
      # commits; subclasses may override or wrap this.
      def import
        files = NewspaperWorks::Data::WorkFiles.new(work)
        files.assign(path)
        files.commit!
      end

      # @return [Object] +current_user+ when such a name is defined in this
      #   context, otherwise the batch user
      def user
        defined?(current_user) ? current_user : User.batch_user
      end

      # Load the given source, then import it.
      #
      # @param source [String, Pathname, #read] path or IO-like object
      # @param filename [String, nil] optional identifying file name
      def ingest(source, filename: nil)
        load(source, filename: filename)
        import
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module NewspaperWorks
  module Ingest
    # Abstract base for publication metadata identified by an LCCN.
    # Construction stores the LCCN and immediately calls #load, which
    # subclasses must implement.
    class BasePublicationInfo
      attr_accessor :lccn, :issn

      # @param lccn [String] Library of Congress Control Number
      # @raise [NotImplementedError] unless a subclass implements #load
      def initialize(lccn)
        @lccn = lccn
        load
      end

      # Load publication metadata; abstract here.
      # @raise [NotImplementedError] always, in this base class
      def load
        raise NotImplementedError, "abstract"
      end

      # Return normalized, prefixed OCLC number from numeric Integer or
      # String inputs. Only the digits of the input are kept; the prefix is
      # always recomputed from the number of digits (8 => ocm, 9 => ocn,
      # otherwise on), so an already correctly prefixed value comes back
      # unchanged.
      # @param oclcnum [String, Integer] prefixed or unprefixed OCLC control #
      # @return [String] normalized, prefixed OCLC number
      def oclc_prefixed(oclcnum)
        # Strip every non-digit, not just letters, so that punctuation or
        # whitespace in forms such as "(OCoLC)12345678" cannot skew the
        # digit count or leak into the result.
        digits = oclcnum.to_s.gsub(/\D/, '')
        case digits.size
        when 8 then "ocm#{digits}"
        when 9 then "ocn#{digits}"
        else "on#{digits}"
        end
      end

      # Extract the text of the first " (...)" qualifier in a title.
      # @param title [String] title such as "Daily news (Provo, Utah)"
      # @return [String, nil] text following the first " (" up to the next
      #   ")", or nil when the title contains no " ("
      def place_name_from_title(title)
        parts = title.split(/ [\(]/)
        return if parts.size < 2
        parts[1].split(')')[0]
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require 'find'
2
+
3
module NewspaperWorks
  module Ingest
    # Mixin module for common batch ingest steps.
    module BatchIngestHelper
      # Decide whether a batch directory holds page images or PDFs.
      #
      # @param path [String] root of the directory tree to inspect
      # @return [String] 'image' if any entry under path has a .jp2, .tif
      #   or .tiff extension (case-insensitive), otherwise 'pdf'
      def detect_media(path)
        Find.find(path) do |entry|
          # Match on a real dotted extension so that names merely ending in
          # those letters (e.g. a directory called "motif") do not count;
          # stop walking the tree at the first hit.
          return 'image' if entry.match?(/\.(?:jp2|tiff?)\z/i)
        end
        'pdf' # default
      end

      # @param path [String] path whose final component is taken as LCCN
      # @return [String] base name of the path
      def lccn_from_path(path)
        File.basename(path)
      end

      # Normalize an LCCN: remove all whitespace, downcase, truncate to 13
      # characters, then validate as up to three letters followed by eight
      # or ten digits.
      #
      # @param v [String] candidate LCCN
      # @return [String] normalized LCCN
      # @raise [ArgumentError] if the normalized value does not validate
      def normalize_lccn(v)
        pattern = /\A[A-Za-z]{0,3}[0-9]{8}([0-9]{2})?\z/
        v = v.gsub(/\s+/, '').downcase.slice(0, 13)
        raise ArgumentError, "LCCN appears invalid: #{v}" unless pattern.match(v)
        v
      end

      # @param issue_data [#title] issue metadata source
      # @return [Object] the title reported by issue_data
      def issue_title(issue_data)
        issue_data.title
      end

      # Copy title, LCCN, publication date and edition number from source
      # onto target.
      #
      # @param source [Object] issue metadata source
      # @param target [Object] object with matching attribute writers
      def copy_issue_metadata(source, target)
        target.title = issue_title(source)
        target.lccn = source.lccn
        target.publication_date = source.publication_date
        target.edition_number = source.edition_number
      end

      # Assign the file at path to the work's files and commit.
      #
      # @param work [Object] work receiving the file
      # @param path [String] path to the file to attach
      def attach_file(work, path)
        attachment = NewspaperWorks::Data::WorkFiles.of(work)
        attachment.assign(path)
        attachment.commit!
      end
    end
  end
end
@@ -0,0 +1,129 @@
1
+ require 'open3'
2
+ require 'tmpdir'
3
+
4
module NewspaperWorks
  module Ingest
    # Batch ingest of the issues found under a publication directory,
    # either from per-issue PDFs or from per-page images, depending on the
    # media detected under the path.
    class BatchIssueIngester
      # CLI constructor, related class methods:
      extend NewspaperWorks::Ingest::FromCommand

      include NewspaperWorks::Ingest::PubFinder
      include NewspaperWorks::Ingest::BatchIngestHelper
      include NewspaperWorks::Logging

      attr_accessor :path, :lccn, :publication, :opts, :issues

      # @param path [String] path to the publication directory
      # @param opts [Hash] options; :lccn overrides the LCCN otherwise
      #   taken from the directory name
      # @raise [ArgumentError] if the LCCN does not validate
      def initialize(path, opts = {})
        @path = path
        lccn = opts[:lccn]
        @lccn = normalize_lccn(lccn.nil? ? lccn_from_path(path) : lccn)
        # get publication info for LCCN from authority web service:
        @publication = NewspaperWorks::Ingest::PublicationInfo.new(@lccn)
        # issues for publication, as enumerable chosen by detected media
        @issues = issue_enumerator
        @opts = opts
        configure_logger('ingest')
      end

      # @return [Object] issue enumerator for the path: ImageIngestIssues
      #   when image media is detected, otherwise PDFIssues
      def issue_enumerator
        impl = NewspaperWorks::Ingest::PDFIssues
        impl = NewspaperWorks::Ingest::ImageIngestIssues if detect_media(path) == 'image'
        # issue enumerator depends on detected media:
        impl.new(path, publication)
      end

      # Find or create the publication for this batch's LCCN and title and
      # relate the issue to it (delegates to the PubFinder mixin).
      def link_publication(issue)
        find_or_create_publication_for_issue(
          issue,
          @lccn,
          @publication.title,
          @opts
        )
      end

      # Create, save and log a NewspaperIssue from issue metadata, then
      # link it to its publication.
      #
      # @param issue_data [Object] issue metadata source
      # @return [NewspaperIssue] the created issue
      def create_issue(issue_data)
        issue = NewspaperIssue.create
        copy_issue_metadata(issue_data, issue)
        NewspaperWorks::Ingest.assign_administrative_metadata(issue, @opts)
        issue.save!
        write_log(
          "Created new NewspaperIssue work with date, lccn, edition metadata:"\
          "\n"\
          "\tLCCN: #{@lccn}\n"\
          "\tPublication Date: #{issue_data.publication_date}\n"\
          "\tEdition number: #{issue_data.edition_number}"
        )
        link_publication(issue)
        issue
      end

      # @param issue [NewspaperIssue] issue receiving the PDF
      # @param path [String] path to the issue PDF
      def ingest_pdf(issue, path)
        # ingest primary PDF for issue:
        attach_file(issue, path)
        # queue page creation job:
        CreateIssuePagesJob.perform_later(issue, [path], nil, nil)
      end

      # Create a NewspaperPage for one page image, append it to the issue's
      # ordered members, and attach the image (JP2 sources are converted to
      # TIFF first).
      #
      # @param page_image [Object] page image info (title, page_number, path)
      # @param issue [NewspaperIssue] parent issue
      def create_page(page_image, issue)
        page = NewspaperPage.create
        page.title = page_image.title
        page.page_number = page_image.page_number
        page.save!
        # Link page as a child of issue, via ordered members:
        issue.ordered_members << page
        NewspaperWorks::Ingest.assign_administrative_metadata(page, @opts)
        issue.save!
        # Ensure we have a source TIFF file, attach to page:
        path = page_image.path
        path = make_tiff(path) if path.end_with?('jp2')
        attach_file(page, path)
      end

      # @param issue [NewspaperIssue] parent issue
      # @param issue_data [#each_value] enumerates page image info objects
      def ingest_pages(issue, issue_data)
        # Create pages in order they appear (lexical)
        issue_data.each_value { |page_image| create_page(page_image, issue) }
        # Make an issue PDF from constituent pages, via retryable async job,
        #   which will not succeed until the PDF derivatives are created
        #   for each page, but should eventually succeed on that condition:
        NewspaperWorks::ComposeIssuePDFJob.perform_later(issue)
      end

      # Convert a JP2 file to a TIFF in a fresh temporary directory using
      # the opj_decompress command.
      #
      # @param path [String] path to a file ending in 'jp2'
      # @return [String] path of the generated TIFF
      # @raise [ArgumentError] if path does not end with 'jp2'
      # @raise [RuntimeError] if the converter writes anything to stderr
      def make_tiff(path)
        raise ArgumentError unless path.end_with?('jp2')
        Hyrax.config.whitelisted_ingest_dirs |= [Dir.tmpdir]
        name = File.basename(path).split('.')[0]
        # OpenJPEG2000 has weird quirk, only likes 3-char file ext TIF:
        tiff_path = File.join(Dir.mktmpdir, "#{name}.tif")
        # Pass arguments as a list so no shell is involved: paths containing
        # spaces or shell metacharacters are handed over verbatim.
        cmd = ['opj_decompress', '-i', path, '-o', tiff_path]
        Open3.popen3(*cmd) do |_stdin, _stdout, stderr, _wait_thr|
          unless stderr.read.strip.empty?
            msg = "Error converting JP2 to TIFF: #{path}"
            write_log(msg, Logger::ERROR)
            raise msg
          end
        end
        tiff_path
      end

      # @return [String] 'issue_pdf' when issues are enumerated from PDFs,
      #   otherwise 'page_image'
      def ingest_type
        return 'issue_pdf' if @issues.is_a?(NewspaperWorks::Ingest::PDFIssues)
        'page_image'
      end

      # Ingest every enumerated issue, as a PDF or as page images, logging
      # the start and completion of the batch.
      def ingest
        write_log("Beginning issue(s) batch ingest for #{@path}")
        write_log("\tPublication: #{@publication.title} (LCCN: #{@lccn})")
        # the tactic depends only on the enumerator type, so decide once:
        tactic = ingest_type
        @issues.each do |path, issue_data|
          issue = create_issue(issue_data)
          ingest_pdf(issue, path) if tactic == 'issue_pdf'
          ingest_pages(issue, issue_data) if tactic == 'page_image'
        end
        write_log(
          "Issue ingest completed for LCCN #{@lccn}. Asyncrhonous jobs "\
          "may still be creating derivatives for issue, and child page works."
        )
      end
    end
  end
end
@@ -0,0 +1,133 @@
1
+ require 'faraday'
2
+ require 'nokogiri'
3
+
4
module NewspaperWorks
  module Ingest
    # Publication info from ChronAm as remote authority for metadata
    class ChronAmPublicationInfo < BasePublicationInfo
      attr_accessor :issn, :title, :place_name, :place_of_publication, :language

      # XML namespace prefixes used in XPath queries against the RDF
      XML_NS = {
        dcterms: 'http://purl.org/dc/terms/',
        frbr: 'http://purl.org/vocab/frbr/core#',
        owl: 'http://www.w3.org/2002/07/owl#',
        rda: 'http://rdvocab.info/elements/',
        rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        rdfs: 'http://www.w3.org/2000/01/rdf-schema#'
      }.freeze

      BASE_URL = 'https://chroniclingamerica.loc.gov/lccn'.freeze

      # @param lccn [String] Library of Congress Control Number
      def initialize(lccn)
        # true until loaded
        @empty = true
        super(lccn)
        @issn = nil # chronam doesn't have this
      end

      # @return [Boolean] true until a record has been loaded successfully
      def empty?
        @empty
      end

      def inspect
        format(
          "<#{self.class}:0x000000000%<oid>x " \
          "\tlccn: '#{@lccn}'>",
          oid: object_id << 1
        )
      end

      # Set place name, and its GeoNames URI, from the place-of-publication
      # element of the loaded document; does nothing when there is no
      # loaded document or no such element.
      def load_place
        # find returns nil without a document, and an empty node set when
        # nothing matches; either way there is no first node to read.
        place_node = find('//rda:placeOfPublication')&.first
        return if place_node.nil?
        @place_name = place_node.text
        @place_of_publication = NewspaperWorks::Ingest.geonames_place_uri(
          @place_name
        )
      end

      # @return [String] URL of the RDF record for this LCCN
      def url
        "#{BASE_URL}/#{@lccn}.rdf"
      end

      # Fetch and parse the RDF record, populating title, language and
      # place; leaves the object empty when the response status is 404.
      def load
        resp = NewspaperWorks::ResourceFetcher.get url
        return if resp['status'] == 404
        @doc = Nokogiri.XML(resp['body'])
        @title = normalize_title(find('//dcterms:title').first.text)
        @language = iso_language_for(find('//dcterms:language').first.text)
        @empty = false
        load_place
      end

      # @return [String, nil] prefixed OCLC number taken from the first
      #   sameAs resource starting with 'info:oclcnum', or nil if none
      def oclcnum
        key = 'info:oclcnum'
        selected = sameas_resources.select { |v| v.text.start_with?(key) }
        return if selected.empty?
        oclc_prefixed(selected[0].text.split('/')[1])
      end

      # @return [String, nil] URL for the resource named by the record's
      #   frbr:successorOf, or nil if empty or absent
      def preceded_by
        return if empty?
        found = find('//frbr:successorOf/@rdf:resource').first
        return if found.nil?
        normalize_related(found.text)
      end

      # @return [String, nil] URL for the resource named by the record's
      #   frbr:successor, or nil if empty or absent
      def succeeded_by
        return if empty?
        found = find('//frbr:successor/@rdf:resource').first
        return if found.nil?
        normalize_related(found.text)
      end

      private

      def normalize_title(value)
        NewspaperWorks::Ingest.normalize_title(value)
      end

      # Returns URL to LC catalog, provided such exists, on the basis of
      #   non-empty MODS for given LCCN.  Otherwise returns nil.
      def lc_catalog_url(lccn)
        content_url = "https://lccn.loc.gov/#{lccn}"
        url = "#{content_url}/mods"
        resp = NewspaperWorks::ResourceFetcher.get url
        root = Nokogiri.XML(resp['body']).root
        # An empty or non-XML body parses to a document without a root
        # element; treat that the same as empty MODS.
        return if root.nil? || root.children.empty?
        content_url
      end

      # Resolve a related-resource URI to a public URL for its LCCN.
      def normalize_related(value)
        lccn = value.split('/')[-1].split('#')[0]
        lc_url = lc_catalog_url(lccn)
        # URL to lccn.loc.gov is preferred authority for publication URL
        return lc_url unless lc_url.nil?
        # URL to HTML representation of content on ChronAm is fallback
        "#{BASE_URL}/#{lccn}"
      end

      def sameas_resources
        find('//owl:sameAs/@rdf:resource') || []
      end

      # Run a namespaced XPath query.
      # @param expr [String] XPath expression using the XML_NS prefixes
      # @param context [Object, nil] node to query; defaults to the
      #   loaded document
      # @return [Object, nil] query result, or nil when there is no context
      def find(expr, context = nil)
        context ||= @doc
        return if context.nil?
        context.xpath(expr, **XML_NS)
      end

      # ISO 639-2 three-character code from ISO 639-1 two-character code
      #   or equivalent lingvoj resource URL used by ChronAm;
      #   uses HTML language tables maintained by LOC.
      def iso_language_for(code)
        # handle case where source language code is lingvoj url:
        code = code.split('/')[-1]
        lookup_url = 'https://www.loc.gov/standards/iso639-2/php/langcodes_name.php'
        lookup_url += "?iso_639_1=#{code}"
        resp = NewspaperWorks::ResourceFetcher.get lookup_url
        html = Nokogiri::HTML(resp['body'])
        html.xpath('//table[1]/tr[2]/td[2]').first.text.strip
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
# OptionParser is used below; require it here so this module does not
# depend on some other library having loaded it first.
require 'optparse'

module NewspaperWorks
  module Ingest
    # class-method mixin module for ingest command-line invocation
    #   usage in classes: `extend NewspaperWorks::Ingest::FromCommand`
    #   These are all expected to be class methods in various CLI ingests.
    module FromCommand
      # alternate constructor from ARGV
      # @param options [Array<String>] command-line arguments
      # @param cmd_name [String] command name shown in usage messages
      # @return [Object] new instance of the extending class, constructed
      #   with the path and the parsed options
      def from_command(options, cmd_name)
        path, opts = batch_path(options, cmd_name)
        missing_path(cmd_name) if path.nil?
        path = normalize_path(path)
        missing_path(cmd_name, "Not found: #{path}") unless File.exist?(path)
        Hyrax.config.whitelisted_ingest_dirs.push(File.dirname(path))
        new(path, opts)
      end

      # Print usage and the given message to standard error; exits with
      # status 1 only when the command name starts with 'rake'.
      # @param cmd_name [String] command name shown in usage message
      # @param msg [String] explanation of what is wrong
      def missing_path(cmd_name, msg = "Missing path argument")
        STDERR.puts "Usage: #{cmd_name} -- --path=PATH"
        STDERR.puts "#{msg}. Exiting."
        # rubocop:disable Rails/Exit
        exit(1) if cmd_name.start_with?('rake')
        # rubocop:enable Rails/Exit
      end

      # Parse the path and other ingest options from command arguments.
      # @param options [Array<String>] command-line arguments (consumed)
      # @param cmd_name [String] command name shown in the usage banner
      # @return [Array(String, Hash)] the path (nil if not given) and a
      #   hash of parsed options keyed by option name
      def batch_path(options, cmd_name)
        path = nil
        params = {}
        parser = OptionParser.new
        # First pass, before any switch is registered: non-option words are
        # handed to the empty block and discarded. This appears intended to
        # drop everything ahead of the `--` shown in the usage banner.
        args = parser.order!(options) {}
        parser.banner = "Usage: #{cmd_name} -- --path=PATH"
        parser.on('-i PATH', '--path PATH') do |value|
          path = value
        end
        parser.on('--admin_set=ADMIN_SET')
        parser.on('--depositor=DEPOSITOR')
        parser.on('--visibility=VISIBILITY')
        # lccn used by PDF issue ingest, but not NDNP ingest:
        parser.on('--lccn=LCCN')
        parser.parse!(args, into: params)
        [path, params]
      end

      # default normalization is no normalization of path
      # @param path [String]
      # @return [String]
      def normalize_path(path)
        path
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module NewspaperWorks
  module Ingest
    # Enumerates the issue directories of a publication directory whose
    # issues are sets of page images.
    class ImageIngestIssues
      include Enumerable
      include NewspaperWorks::Ingest::PathEnumeration

      attr_accessor :path, :publication

      delegate :lccn, to: :publication

      # @param path [String] publication directory containing issues
      # @param publication [Object] publication info
      def initialize(path, publication)
        # path is path to publication directory containing issues:
        @path = path
        # Publication info
        @publication = publication
        @issue_paths = nil
      end

      # Issue directories directly under the publication directory:
      # non-hidden directories whose names validate as issue names.
      # Computed once, then memoized.
      #
      # @return [Array<String>] issue directory paths, sorted lexically
      def paths
        return @issue_paths unless @issue_paths.nil?
        # Dir.entries order is filesystem-dependent; sort so that issues
        # are enumerated in a deterministic (lexical) order.
        names = Dir.entries(path).reject { |n| n.start_with?('.') }.sort
        candidates = names.map { |n| File.join(path, n) }
        @issue_paths = candidates.select do |p|
          File.directory?(p) && path_validates?(p)
        end
      end

      # @param path [String] issue directory path
      # @return [NewspaperWorks::Ingest::IssueImages] issue info for path
      def info(path)
        NewspaperWorks::Ingest::IssueImages.new(path, @publication)
      end

      private

      # True when the base name starts with a date as YYYYMMDD (month 01-12,
      # day 01-31), optionally followed by two more digits; anything may
      # follow that prefix.
      def path_validates?(p)
        ptn = /\A([0-9]{4})(1[012]|[0][1-9])(3[01]|[12][0-9]|0[1-9])([0-9]{2})?/
        ptn.match?(File.basename(p))
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'date'
2
+ require 'find'
3
+
4
module NewspaperWorks
  module Ingest
    # Represents TIFF/JP2 issue, provides metadata, enumerates PageImage objects
    class IssueImages
      # most accessors for issue/edition metadata, publication metadata
      #   provided by including this mixin:
      include NewspaperWorks::Ingest::NamedIssueMetadata

      # Path enumeration by mixing in Enumerable, PathEnumeration
      include Enumerable
      include NewspaperWorks::Ingest::PathEnumeration

      attr_accessor :path, :publication

      # things that look like images, by file extension:
      IMAGE_EXT = ['tiff', 'tif', 'jp2', 'jpg', 'png'].freeze

      # @param path [String] issue directory containing page images
      # @param publication [Object] publication info
      # @raise [ArgumentError] if path is not a directory
      def initialize(path, publication)
        @path = path
        raise ArgumentError, 'Path not directory' unless File.directory?(path)
        validate_path
        # as a NewspaperWorks::Ingest::PublicationInfo object:
        @publication = publication
        @pages = nil
      end

      # Page image files directly inside the issue directory: regular files
      # whose extension (case-insensitive) is listed in IMAGE_EXT. Computed
      # once, then memoized.
      #
      # @return [Array<String>] page image paths, sorted lexically
      def page_paths
        return @pages unless @pages.nil?
        candidates = Dir.entries(path).map { |n| File.join(path, n) }.sort
        # Memoize only the complete list. File.extname keeps extensionless
        # or hidden files (e.g. "tiff", ".png") from passing as images.
        @pages = candidates.select do |p|
          File.ftype(p) == 'file' &&
            IMAGE_EXT.include?(File.extname(p).delete('.').downcase)
        end
      end

      # @param path [String] one of the paths returned by #page_paths
      # @return [NewspaperWorks::Ingest::PageImage] page info carrying the
      #   1-based sequence number of the page within the issue
      # @raise [ArgumentError] if path is not a page image of this issue
      def info(path)
        position = page_paths.index(path)
        raise ArgumentError, "Not a page image of this issue: #{path}" if
          position.nil?
        NewspaperWorks::Ingest::PageImage.new(path, self, position + 1)
      end

      alias paths page_paths
    end
  end
end