newspaper_works 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (461) hide show
  1. checksums.yaml +7 -0
  2. data/.fcrepo_wrapper +4 -0
  3. data/.gitignore +43 -0
  4. data/.rubocop.yml +143 -0
  5. data/.solr_wrapper +8 -0
  6. data/.travis.yml +50 -0
  7. data/Gemfile +47 -0
  8. data/LICENSE +203 -0
  9. data/README.md +159 -0
  10. data/Rakefile +38 -0
  11. data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
  12. data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
  13. data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
  14. data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
  15. data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
  16. data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
  17. data/app/assets/config/newspaper_works_manifest.js +2 -0
  18. data/app/assets/images/newspaper_works/.keep +0 -0
  19. data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
  20. data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
  21. data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
  22. data/app/assets/javascripts/newspaper_works.js +4 -0
  23. data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
  24. data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
  25. data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
  26. data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
  27. data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
  28. data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
  29. data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
  30. data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
  31. data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
  32. data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
  33. data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
  34. data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
  35. data/app/forms/hyrax/newspaper_article_form.rb +11 -0
  36. data/app/forms/hyrax/newspaper_container_form.rb +11 -0
  37. data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
  38. data/app/forms/hyrax/newspaper_page_form.rb +15 -0
  39. data/app/forms/hyrax/newspaper_title_form.rb +12 -0
  40. data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
  41. data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
  42. data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
  43. data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
  44. data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
  45. data/app/helpers/newspaper_works/application_helper.rb +5 -0
  46. data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
  47. data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
  48. data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
  49. data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
  50. data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
  51. data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
  52. data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
  53. data/app/indexers/newspaper_article_indexer.rb +16 -0
  54. data/app/indexers/newspaper_container_indexer.rb +18 -0
  55. data/app/indexers/newspaper_issue_indexer.rb +26 -0
  56. data/app/indexers/newspaper_page_indexer.rb +9 -0
  57. data/app/indexers/newspaper_title_indexer.rb +19 -0
  58. data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
  59. data/app/jobs/newspaper_works/application_job.rb +4 -0
  60. data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
  61. data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
  62. data/app/mailers/newspaper_works/application_mailer.rb +8 -0
  63. data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
  64. data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
  65. data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
  66. data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
  67. data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
  68. data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
  69. data/app/models/file_set.rb +10 -0
  70. data/app/models/newspaper_article.rb +158 -0
  71. data/app/models/newspaper_container.rb +86 -0
  72. data/app/models/newspaper_issue.rb +115 -0
  73. data/app/models/newspaper_page.rb +70 -0
  74. data/app/models/newspaper_title.rb +111 -0
  75. data/app/models/newspaper_works/application_record.rb +6 -0
  76. data/app/models/newspaper_works/derivative_attachment.rb +8 -0
  77. data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
  78. data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
  79. data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
  80. data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
  81. data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
  82. data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
  83. data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
  84. data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
  85. data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
  86. data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
  87. data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
  88. data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
  89. data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
  90. data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
  91. data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
  92. data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
  93. data/app/services/hyrax/article_genre_service.rb +9 -0
  94. data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
  95. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
  96. data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
  97. data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
  98. data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
  99. data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
  100. data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
  101. data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
  102. data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
  103. data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
  104. data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
  105. data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
  106. data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
  107. data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
  108. data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
  109. data/app/views/catalog/_snippets_more.html.erb +16 -0
  110. data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
  111. data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
  112. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  113. data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
  114. data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
  115. data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
  116. data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
  117. data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
  118. data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
  119. data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
  120. data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
  121. data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
  122. data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
  123. data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
  124. data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
  125. data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
  126. data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
  127. data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
  128. data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
  129. data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
  130. data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
  131. data/app/views/newspaper_works/base/_show.html.erb +45 -0
  132. data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
  133. data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
  134. data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
  135. data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
  136. data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
  137. data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
  138. data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
  139. data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
  140. data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
  141. data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
  142. data/app/views/records/edit_fields/_genre.html.erb +4 -0
  143. data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
  144. data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
  145. data/bin/rails +13 -0
  146. data/config/fcrepo_wrapper_test.yml +5 -0
  147. data/config/initializers/assets.rb +2 -0
  148. data/config/locales/newspaper_article.de.yml +12 -0
  149. data/config/locales/newspaper_article.en.yml +12 -0
  150. data/config/locales/newspaper_article.es.yml +12 -0
  151. data/config/locales/newspaper_article.fr.yml +12 -0
  152. data/config/locales/newspaper_article.it.yml +12 -0
  153. data/config/locales/newspaper_article.pt-BR.yml +12 -0
  154. data/config/locales/newspaper_article.zh.yml +12 -0
  155. data/config/locales/newspaper_container.de.yml +8 -0
  156. data/config/locales/newspaper_container.en.yml +8 -0
  157. data/config/locales/newspaper_container.es.yml +8 -0
  158. data/config/locales/newspaper_container.fr.yml +8 -0
  159. data/config/locales/newspaper_container.it.yml +8 -0
  160. data/config/locales/newspaper_container.pt-BR.yml +8 -0
  161. data/config/locales/newspaper_container.zh.yml +8 -0
  162. data/config/locales/newspaper_issue.de.yml +8 -0
  163. data/config/locales/newspaper_issue.en.yml +8 -0
  164. data/config/locales/newspaper_issue.es.yml +8 -0
  165. data/config/locales/newspaper_issue.fr.yml +8 -0
  166. data/config/locales/newspaper_issue.it.yml +8 -0
  167. data/config/locales/newspaper_issue.pt-BR.yml +8 -0
  168. data/config/locales/newspaper_issue.zh.yml +8 -0
  169. data/config/locales/newspaper_page.de.yml +15 -0
  170. data/config/locales/newspaper_page.en.yml +15 -0
  171. data/config/locales/newspaper_page.es.yml +15 -0
  172. data/config/locales/newspaper_page.fr.yml +15 -0
  173. data/config/locales/newspaper_page.it.yml +15 -0
  174. data/config/locales/newspaper_page.pt-BR.yml +15 -0
  175. data/config/locales/newspaper_page.zh.yml +15 -0
  176. data/config/locales/newspaper_title.de.yml +8 -0
  177. data/config/locales/newspaper_title.en.yml +8 -0
  178. data/config/locales/newspaper_title.es.yml +8 -0
  179. data/config/locales/newspaper_title.fr.yml +8 -0
  180. data/config/locales/newspaper_title.it.yml +8 -0
  181. data/config/locales/newspaper_title.pt-BR.yml +8 -0
  182. data/config/locales/newspaper_title.zh.yml +8 -0
  183. data/config/locales/newspaper_works.de.yml +50 -0
  184. data/config/locales/newspaper_works.en.yml +52 -0
  185. data/config/locales/newspaper_works.es.yml +52 -0
  186. data/config/locales/newspaper_works.fr.yml +52 -0
  187. data/config/locales/newspaper_works.it.yml +52 -0
  188. data/config/locales/newspaper_works.pt-BR.yml +52 -0
  189. data/config/locales/newspaper_works.zh.yml +52 -0
  190. data/config/routes.rb +9 -0
  191. data/config/solr_wrapper_test.yml +9 -0
  192. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  193. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  194. data/config/test-fixture/solr-config/elevate.xml +36 -0
  195. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  196. data/config/test-fixture/solr-config/protwords.txt +21 -0
  197. data/config/test-fixture/solr-config/schema.xml +366 -0
  198. data/config/test-fixture/solr-config/scripts.conf +24 -0
  199. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  200. data/config/test-fixture/solr-config/spellings.txt +2 -0
  201. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  202. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  203. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  204. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  205. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  206. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  207. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  208. data/config/vendor/imagemagick-6-policy.xml +76 -0
  209. data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
  210. data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
  211. data/lib/generators/newspaper_works/assets_generator.rb +29 -0
  212. data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
  213. data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
  214. data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
  215. data/lib/generators/newspaper_works/install_generator.rb +97 -0
  216. data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
  217. data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
  218. data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
  219. data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
  220. data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
  221. data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
  222. data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
  223. data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
  224. data/lib/newspaper_works/configuration.rb +14 -0
  225. data/lib/newspaper_works/data/fileset_helper.rb +25 -0
  226. data/lib/newspaper_works/data/path_helper.rb +40 -0
  227. data/lib/newspaper_works/data/work_derivatives.rb +314 -0
  228. data/lib/newspaper_works/data/work_file.rb +92 -0
  229. data/lib/newspaper_works/data/work_files.rb +181 -0
  230. data/lib/newspaper_works/data.rb +35 -0
  231. data/lib/newspaper_works/engine.rb +42 -0
  232. data/lib/newspaper_works/errors.rb +14 -0
  233. data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
  234. data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
  235. data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
  236. data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
  237. data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
  238. data/lib/newspaper_works/ingest/from_command.rb +52 -0
  239. data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
  240. data/lib/newspaper_works/ingest/issue_images.rb +51 -0
  241. data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
  242. data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
  243. data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
  244. data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
  245. data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
  246. data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
  247. data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
  248. data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
  249. data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
  250. data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
  251. data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
  252. data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
  253. data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
  254. data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
  255. data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
  256. data/lib/newspaper_works/ingest/ndnp.rb +21 -0
  257. data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
  258. data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
  259. data/lib/newspaper_works/ingest/page_image.rb +52 -0
  260. data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
  261. data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
  262. data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
  263. data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
  264. data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
  265. data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
  266. data/lib/newspaper_works/ingest/publication_info.rb +44 -0
  267. data/lib/newspaper_works/ingest.rb +90 -0
  268. data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
  269. data/lib/newspaper_works/logging.rb +54 -0
  270. data/lib/newspaper_works/page_finder.rb +62 -0
  271. data/lib/newspaper_works/resource_fetcher.rb +78 -0
  272. data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
  273. data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
  274. data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
  275. data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
  276. data/lib/newspaper_works/text_extraction.rb +10 -0
  277. data/lib/newspaper_works/version.rb +3 -0
  278. data/lib/newspaper_works.rb +19 -0
  279. data/lib/tasks/newspaper_works_tasks.rake +39 -0
  280. data/newspaper_works.gemspec +49 -0
  281. data/spec/.keep.txt +1 -0
  282. data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
  283. data/spec/controllers/catalog_controller_spec.rb +63 -0
  284. data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
  285. data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
  286. data/spec/factories/ability.rb +6 -0
  287. data/spec/factories/newspaper_issue.rb +7 -0
  288. data/spec/factories/newspaper_issue_ingest.rb +6 -0
  289. data/spec/factories/newspaper_page.rb +7 -0
  290. data/spec/factories/newspaper_page_ingest.rb +6 -0
  291. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  292. data/spec/factories/newspaper_title.rb +8 -0
  293. data/spec/factories/uploaded_pdf_file.rb +9 -0
  294. data/spec/factories/user.rb +13 -0
  295. data/spec/features/front_pages_for_title_spec.rb +19 -0
  296. data/spec/features/newspaper_title_search_spec.rb +30 -0
  297. data/spec/features/newspapers_search_spec.rb +49 -0
  298. data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
  299. data/spec/features_shared.rb +71 -0
  300. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  301. data/spec/fixtures/files/4.1.07.tiff +0 -0
  302. data/spec/fixtures/files/README.md +7 -0
  303. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  304. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  305. data/spec/fixtures/files/credits.md +16 -0
  306. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  307. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  308. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  309. data/spec/fixtures/files/minimal-alto.xml +31 -0
  310. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  311. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  312. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  313. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  314. data/spec/fixtures/files/ocr_alto.xml +202 -0
  315. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  316. data/spec/fixtures/files/ocr_color.tiff +0 -0
  317. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  318. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  319. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  320. data/spec/fixtures/files/page1.tiff +0 -0
  321. data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
  322. data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
  323. data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
  324. data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
  325. data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
  326. data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
  327. data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
  328. data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
  329. data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
  330. data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
  331. data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
  332. data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
  333. data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
  334. data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
  335. data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
  336. data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
  337. data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
  338. data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
  339. data/spec/fixtures/files/resource_mocks/urls.json +82 -0
  340. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  341. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  342. data/spec/fixtures/files/thumbnail.jpg +0 -0
  343. data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
  344. data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
  345. data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
  346. data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
  347. data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
  348. data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
  349. data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
  350. data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
  351. data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
  352. data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
  353. data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
  354. data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
  355. data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
  356. data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
  357. data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
  358. data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
  359. data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
  360. data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
  361. data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
  362. data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
  363. data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
  364. data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
  365. data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
  366. data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
  367. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
  368. data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
  369. data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
  370. data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
  371. data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
  372. data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
  373. data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
  374. data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
  375. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
  376. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
  377. data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
  378. data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
  379. data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
  380. data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
  381. data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
  382. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
  383. data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
  384. data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
  385. data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
  386. data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
  387. data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
  388. data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
  389. data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
  390. data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
  391. data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
  392. data/spec/lib/newspaper_works/logging_spec.rb +53 -0
  393. data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
  394. data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
  395. data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
  396. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
  397. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
  398. data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
  399. data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
  400. data/spec/misc_shared.rb +109 -0
  401. data/spec/model_shared.rb +134 -0
  402. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
  403. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
  404. data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
  405. data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
  406. data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
  407. data/spec/models/newspaper_article_spec.rb +73 -0
  408. data/spec/models/newspaper_container_spec.rb +111 -0
  409. data/spec/models/newspaper_issue_spec.rb +91 -0
  410. data/spec/models/newspaper_page_spec.rb +44 -0
  411. data/spec/models/newspaper_title_spec.rb +116 -0
  412. data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
  413. data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
  414. data/spec/models/solr_document_spec.rb +14 -0
  415. data/spec/ndnp_shared.rb +48 -0
  416. data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
  417. data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
  418. data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
  419. data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
  420. data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
  421. data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
  422. data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
  423. data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
  424. data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
  425. data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
  426. data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
  427. data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
  428. data/spec/routing/route_spec.rb +52 -0
  429. data/spec/search_builders/custom_search_builder_spec.rb +34 -0
  430. data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
  431. data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
  432. data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
  433. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
  434. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
  435. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
  436. data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
  437. data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
  438. data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
  439. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
  440. data/spec/spec_helper.rb +261 -0
  441. data/spec/support/controller_level_helpers.rb +28 -0
  442. data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
  443. data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
  444. data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
  445. data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
  446. data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
  447. data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
  448. data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
  449. data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
  450. data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
  451. data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
  452. data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
  453. data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
  454. data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
  455. data/tasks/newspaperworks_dev.rake +26 -0
  456. data/test/integration/navigation_test.rb +7 -0
  457. data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
  458. data/test/newspaper_works_test.rb +7 -0
  459. data/test/test_helper.rb +17 -0
  460. data/tmp/.keep +0 -0
  461. metadata +1037 -0
@@ -0,0 +1,52 @@
1
module NewspaperWorks
  module Ingest
    # Represents a single TIFF/JP2 page image: the path to the file, the
    # issue it belongs to, and page-numbering metadata.
    class PageImage
      # Matches the word "page" (any case), an optional dash/underscore
      # separator, then the page number itself, terminated by a period.
      PAGE_NUMBER_PATTERN = /(page)([_-]?)([^.]+)([.])/i

      attr_accessor :path, :issue, :sequence

      # @param path [String] path to image file; must be an existing
      #   regular file
      # @param issue [Object] owning issue; this class calls `lccn` and
      #   `title` on it (the original comment names
      #   NewspaperWorks::Ingest::IssueImages)
      # @param sequence [Integer, String] page sequence number, coerced
      #   with `to_i`
      # @raise [ArgumentError] if path is missing or is not a regular file
      def initialize(path, issue, sequence)
        @path = path
        validate_path
        @issue = issue
        @sequence = sequence.to_i
      end

      # LCCN is read from the owning issue.
      # @return [Object] whatever `issue.lccn` returns
      def lccn
        issue.lccn
      end

      # Page number inferred from the image *filename*, or nil, presuming:
      #   - The page number follows the actual word "page"
      #     (case-insensitive), possibly separated by a dash or underscore.
      #   - The page number is terminated by a period (normally the one
      #     preceding the file extension).
      #   - Extraneous leading information in the filename (e.g. a
      #     datestamp) is ignored.
      #   - Examples: 'Page1.tiff', '2019091801-page_1.jp2', 'page_C2.tiff'
      # Only the basename is examined, so a directory named e.g. "pages"
      # cannot be mistaken for a page marker.
      # @return [String, NilClass] page number string, or nil if
      #   indecipherable
      def named_page_number
        match = PAGE_NUMBER_PATTERN.match(File.basename(path))
        match && match[3]
      end

      # @return [String] page number from the filename when one can be
      #   inferred, otherwise the sequence number as a string
      def page_number
        named_page_number || @sequence.to_s
      end

      # @return [Array<String>] single-element array with a title built
      #   from the first issue title and the page number
      def title
        ["#{@issue.title.first}: Page #{page_number}"]
      end

      # Expect path to be a regular file that exists.
      # `File.ftype` is kept (rather than `File.file?`) so that anything
      # it does not report as 'file' is still rejected.
      # @raise [ArgumentError] when the path fails either check
      def validate_path
        raise ArgumentError, "Path does not exist: #{path}" unless File.exist?(path)
        raise ArgumentError, "Path is not a regular file: #{path}" unless File.ftype(path) == 'file'
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
module NewspaperWorks
  module Ingest
    # Provides hash-like enumeration of path keys to object values, where
    # the consuming class:
    #   - Defines a `paths` method returning an array of paths.
    #   - Defines an `info` method that returns an object for a path.
    #   - Also mixes in Enumerable.
    module PathEnumeration
      # @return [Integer] number of paths
      def size
        _paths.size
      end

      # @param path [Object] candidate key
      # @return [Boolean] whether path is among the enumerated paths
      def include?(path)
        _paths.include?(path)
      end

      # Indirection over the consumer's `paths`, used as alias target.
      def _paths
        paths
      end

      # Indirection over the consumer's `info`, used as alias target.
      def _info(path)
        info(path)
      end

      # Yields a two-element [path, info] array for each path.
      # @return [Enumerator] when no block is given
      def each
        return enum_for(:each) unless block_given?
        paths.each do |path|
          yield [path, info(path)]
        end
      end

      # Yields each path.
      # @return [Enumerator] when no block is given
      def each_key
        # `return` is essential: without it a block-less call falls
        # through to `yield` and raises LocalJumpError.
        return enum_for(:each_key) unless block_given?
        paths.each { |path| yield path }
      end

      # Yields the info object for each path.
      # @return [Enumerator] when no block is given
      def each_value
        return enum_for(:each_value) unless block_given?
        paths.each do |path|
          yield info(path)
        end
      end

      # @return [Array] info objects for all paths, in path order
      def values
        each_value.to_a
      end

      # @return [Array<Array>] [path, info] pairs for all paths
      def entries
        each.to_a
      end

      alias each_pair each
      alias keys _paths
      alias has_key? include?
      alias [] _info
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'open3'
2
+ require 'mini_magick'
3
+
4
module NewspaperWorks
  module Ingest
    # PdfImages uses the poppler 0.19+ `pdfimages -list` command to extract
    # image listing metadata from PDF files.
    # For dpi extraction, falls back to calculating with MiniMagick when
    # the listing has no ppi column.
    class PdfImages
      # Zero-based column numbers in a whitespace-split listing row:
      COL_WIDTH = 3
      COL_HEIGHT = 4
      COL_COLOR = 5
      COL_CHANNELS = 6
      COL_BITS = 7
      # only poppler 0.25+ has this column in output:
      COL_XPPI = 12

      # @param path [String] path to PDF file
      def initialize(path)
        @path = path
        # argv form (no shell), so paths containing spaces or shell
        # metacharacters are passed through verbatim:
        @cmd = ['pdfimages', '-list', path.to_s]
        @output = nil
        @entries = nil
      end

      # Runs the listing command (once; raw output lines are cached).
      # @return [Array<String>] listing rows with the two header lines
      #   removed; empty when the command printed fewer than three lines
      def process
        if @output.nil?
          # capture3 drains stdout and stderr together, so the child
          # cannot block on a full stderr pipe.
          stdout, _stderr, _status = Open3.capture3(*@cmd)
          @output = stdout.split("\n")
        end
        @output.drop(2)
      end

      # @return [Array<Array<String>>] one array of column strings per
      #   listed image (cached after first call)
      def entries
        @entries ||= process.map(&:split)
      end

      # @param i [Integer] zero-based column number
      # @yield [String] optional transform applied to each column value
      # @return [Array] column values, transformed if a block was given
      def selectcolumn(i, &block)
        column = entries.map { |e| e[i] }
        block_given? ? column.map(&block) : column
      end

      # @return [Integer, NilClass] largest listed image width, nil if
      #   no images are listed
      def width
        selectcolumn(COL_WIDTH, &:to_i).max
      end

      # @return [Integer, NilClass] largest listed image height, nil if
      #   no images are listed
      def height
        selectcolumn(COL_HEIGHT, &:to_i).max
      end

      # Color description across all listed images. 1-bit gray is
      # black/white, so the caller may want all three values; for mixed
      # color spaces across images this reports the maximum.
      # @return [Array] [desc, channels, bits]: desc is 'gray' only when
      #   every image is gray, else 'rgb'; channels and bits are maxima
      #   (nil when no images are listed)
      def color
        desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
        channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
        bits = entries.map { |e| e[COL_BITS].to_i }.max
        [desc, channels, bits]
      end

      # @return [Integer, NilClass] largest horizontal ppi, or nil when
      #   the PDF lists no images (consistent with #width and #height)
      def ppi
        return nil if entries.empty?
        # with poppler 0.25+, pdfimages just gives us this:
        return selectcolumn(COL_XPPI, &:to_i).max if entries.first.size > COL_XPPI
        # poppler < 0.25: the original treats the width MiniMagick
        # reports for the PDF as a width in points (72 per inch)
        pdf = MiniMagick::Image.open(@path)
        (72 * width / pdf.width).to_i
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'date'
2
+
3
module NewspaperWorks
  module Ingest
    # A single issue PDF on disk, paired with metadata for the publication
    # the issue belongs to.
    class PDFIssue
      attr_accessor :path
      attr_accessor :publication

      # Most accessors for issue/edition metadata and publication metadata
      # are provided by including this mixin:
      include NewspaperWorks::Ingest::NamedIssueMetadata

      # @param path [String] path to the issue PDF
      # @param publication [NewspaperWorks::Ingest::PublicationInfo]
      #   metadata for the publication
      def initialize(path, publication)
        @path = path
        # `validate_path` is not defined in this class (presumably it comes
        # from the mixin); it runs before publication is assigned.
        validate_path
        @publication = publication
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'find'
2
+
3
module NewspaperWorks
  module Ingest
    # Enumerable collection of issue PDFs found beneath a directory, keyed
    # by path, for a single publication.
    class PDFIssues
      include Enumerable
      include NewspaperWorks::Ingest::PathEnumeration

      attr_accessor :path, :publication, :pdf_paths

      # PathEnumeration expects a `paths` method:
      alias paths pdf_paths

      # @param path [String] directory searched (recursively) for PDFs
      # @param publication [NewspaperWorks::Ingest::PublicationInfo]
      #   metadata for the publication
      def initialize(path, publication)
        @path = path
        @publication = publication
        @pdf_paths = valid_pdfs(path)
      end

      # @param path [String] directory to walk
      # @return [Array<String>] paths of non-directory entries whose name
      #   ends in '.pdf' (case-sensitive), in traversal order
      def valid_pdfs(path)
        Find.find(path).select do |candidate|
          !File.directory?(candidate) && candidate.end_with?('.pdf')
        end
      end

      # @return [Object] LCCN as reported by the publication metadata
      def lccn
        @publication.lccn
      end

      # @param path [String] path to a single issue PDF
      # @return [NewspaperWorks::Ingest::PDFIssue] issue object for path
      def info(path)
        NewspaperWorks::Ingest::PDFIssue.new(path, @publication)
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
+ require 'open3'
2
+ require 'securerandom'
3
+ require 'tmpdir'
4
+
5
module NewspaperWorks
  module Ingest
    # Enumerates the pages of a PDF as TIFF files rendered by Ghostscript
    # into a temporary directory.
    class PdfPages
      include Enumerable

      # @param path [String] path to source PDF
      def initialize(path)
        @baseid = SecureRandom.uuid
        @pdfpath = path
        @info = nil
        @entries = nil
        @tmpdir = nil
        @size = nil
        @pagecount = nil
        @pdftext = nil
      end

      # @return [PdfImages] image-listing metadata for the PDF (cached)
      def pdfinfo
        @info ||= PdfImages.new(@pdfpath)
      end

      # @return [String] temporary output directory, created on first use
      def tmpdir
        @tmpdir ||= Dir.mktmpdir
      end

      # Ghostscript color TIFF device name: either 8bpc (24-bit) or 16bpc
      # (48-bit); any other bit depth (e.g. a CMYK source) is rendered as
      # 24-bit RGB.
      # @param channels [Integer] color channels
      # @param bpc [Integer] bits per channel
      # @return [String] 'tiff24nc' or 'tiff48nc'
      def colordevice(channels, bpc)
        bits = bpc * channels
        bits = 24 unless [24, 48].include?(bits)
        "tiff#{bits}nc"
      end

      # @return [String] Ghostscript output device for this PDF
      def gsdevice
        color, channels, bpc = pdfinfo.color
        # No embedded images means no color information (nil values):
        # render as 24-bit color rather than failing on nil arithmetic.
        return colordevice(3, 8) if channels.nil? || bpc.nil?
        if color == 'gray'
          # CCITT Group 4 Black and White, if applicable:
          return 'tiffg4' if bpc == 1
          # 8 Bit Grayscale, if applicable:
          return 'tiffgray' if bpc > 1
        end
        # otherwise color:
        colordevice(channels, bpc)
      end

      # Extracts text with the Ghostscript txtwrite device.
      # @return [String] captured standard output of the command
      def gstext
        # argv form (no shell): safe for paths with spaces/metacharacters
        stdout, _stderr, _status = Open3.capture3(
          'gs', '-q', '-dNOPAUSE', '-dBATCH', '-sDEVICE=txtwrite',
          '-sOutputFile=-', '-f', @pdfpath.to_s
        )
        @pdftext = stdout
      end

      # Page count from the 'Pages:' line of poppler `pdfinfo` output;
      # the command runs once and the result is cached.
      # @return [Integer] page count; 0 if no 'Pages:' line was printed
      def pagecount
        return @pagecount unless @pagecount.nil?
        stdout, _stderr, _status = Open3.capture3('pdfinfo', @pdfpath.to_s)
        pages_line = stdout.split("\n").find { |line| line.start_with?('Pages:') }
        @pagecount = pages_line.to_s.split.last.to_i
      end

      # Heuristic: a single 10mp+ image per page looks like scanned media.
      # @return [Boolean] false when the PDF has no embedded images
      def looks_scanned
        images = pdfinfo.entries
        # width/height are nil without images; nothing scanned to detect
        return false if images.empty?
        max_image_px = pdfinfo.width * pdfinfo.height
        single_image_per_page = images.length == pagecount
        single_image_per_page && max_image_px > 1024 * 1024 * 10
      end

      # @return [Integer] render resolution: 400 for something that does
      #   not look like scanned media, otherwise the detected image PPI
      def ppi
        return 400 unless looks_scanned
        pdfinfo.ppi
      end

      # Ghostscript convert all pages to TIFF.
      # @return [Array<String>] expected output filenames, one per 'Page '
      #   line Ghostscript printed
      def gsconvert
        output_base = File.join(tmpdir, "#{@baseid}-page%d.tiff")
        # capture3 drains stdout and stderr together, so Ghostscript
        # cannot block on a full stderr pipe.
        stdout, _stderr, _status = Open3.capture3(
          'gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{gsdevice}",
          '-dTextAlphaBits=4',
          "-sOutputFile=#{output_base}", "-r#{ppi}", '-f', @pdfpath.to_s
        )
        @size = stdout.split("\n").count { |line| line.start_with?('Page ') }
        (1..@size).map { |n| File.join(tmpdir, "#{@baseid}-page#{n}.tiff") }
      end

      # @return [Array<String>] page TIFF paths; conversion runs on the
      #   first call and the result is cached
      def entries
        @entries ||= gsconvert
      end

      # Yields each page TIFF path.
      # @return [Enumerator] when no block is given
      def each(&block)
        return enum_for(:each) unless block_given?
        entries.each(&block)
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
+ require 'newspaper_works/logging'
2
+ require 'newspaper_works/ingest'
3
+
4
module NewspaperWorks
  module Ingest
    # Mixin for find-or-create of a publication (NewspaperTitle), for use
    # by various ingests.
    module PubFinder
      include NewspaperWorks::Logging

      # Fields copied from looked-up metadata onto the publication:
      COPY_FIELDS = %i[
        title
        lccn
        oclcnum
        issn
        place_of_publication
        language
        preceded_by
        succeeded_by
      ].freeze

      # Fields whose value is assigned wrapped in an array:
      MULTI_VALUED = %i[
        title
        language
        preceded_by
        succeeded_by
        place_of_publication
      ].freeze

      # Fields whose value is wrapped in an instance of the mapped class:
      WRAPPERS = {
        place_of_publication: Hyrax::ControlledVocabularies::Location
      }.freeze

      # @param lccn [String] Library of Congress Control Number
      #   of Publication
      # @return [NewspaperTitle, NilClass] publication or nil if not found
      def find_publication(lccn)
        NewspaperTitle.where(lccn: lccn).first
      end

      # Copy publication metadata from authority lookup for LCCN
      # @param publication [NewspaperTitle]
      # @param metadata [NewspaperWorks::Ingest::PublicationInfo]
      # @param lccn [String] used as the title of last resort
      # @param title [String, NilClass] locally-specified title, if any
      def copy_publication_metadata(publication, metadata, lccn, title = nil)
        COPY_FIELDS.each do |field|
          raw = metadata.send(field)
          next if raw.nil?
          # wrapped value, if applicable:
          prepared = WRAPPERS.include?(field) ? WRAPPERS[field].new(raw) : raw
          # value in array, if applicable:
          prepared = [prepared] if MULTI_VALUED.include?(field)
          publication.send("#{field}=", prepared)
        end
        # prefer locally-specified title to looked-up title:
        publication.title = [title] unless title.nil?
        # final fallback, nothing specified, title mandatory: use LCCN
        publication.title = [lccn] if publication.title.empty?
      end

      # Creates, populates, saves and logs a new publication for an LCCN.
      # @param lccn [String] Library of Congress Control Number
      # @param title [String, NilClass] locally-specified title, if any
      # @param opts [Hash] passed on to
      #   NewspaperWorks::Ingest.assign_administrative_metadata
      # @return [NewspaperTitle] the saved publication
      def create_publication(lccn, title = nil, opts = {})
        publication = NewspaperTitle.create
        info = NewspaperWorks::Ingest::PublicationInfo.new(lccn)
        copy_publication_metadata(publication, info, lccn, title)
        publication.lccn ||= lccn
        NewspaperWorks::Ingest.assign_administrative_metadata(publication, opts)
        publication.save!
        write_log("Created NewspaperTitle work #{publication.id} for LCCN #{lccn}")
        publication
      end

      # Finds the publication for an LCCN (creating it if absent), appends
      # the issue to its members, saves, and logs the link.
      # @param issue [NewspaperIssue] issue to link
      # @param lccn [String] Library of Congress Control Number
      # @param title [String, NilClass] title used if created
      # @param opts [Hash] options used if created
      def find_or_create_publication_for_issue(issue, lccn, title, opts)
        publication = find_publication(lccn)
        if publication.nil?
          publication = create_publication(lccn, title, opts)
        else
          write_log("Found existing NewspaperTitle #{publication.id}, LCCN #{lccn}")
        end
        publication.members << issue
        publication.save!
        write_log("Linked NewspaperIssue #{issue.id} to NewspaperTitle work #{publication.id}")
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require 'faraday'
2
+ require 'nokogiri'
3
+ require 'uri'
4
+
5
module NewspaperWorks
  module Ingest
    # Proxy for publication metadata: loads an LC-catalog-backed
    # implementation, replaces it with a ChronAm-backed one when the first
    # is empty, and forwards unknown methods to the implementation.
    class PublicationInfo
      attr_accessor :implementation, :lccn

      # @param lccn [String] Library of Congress Control Number
      def initialize(lccn)
        @lccn = lccn
        @implementation = nil
        load
      end

      # Replaces the implementation with a ChronAmPublicationInfo for the
      # same LCCN.
      def load_chronam_fallback
        @implementation = ChronAmPublicationInfo.new(@lccn)
      end

      # Builds and loads the LC implementation, falling back if it
      # reports empty.
      def load
        primary = LCPublicationInfo.new(@lccn)
        @implementation = primary
        primary.load
        # Empty mods is equivalent to 404 for LCCN in LC Catalog:
        load_chronam_fallback if @implementation.empty?
      end

      # The proxy responds to whatever the implementation responds to.
      def respond_to_missing?(symbol, include_priv = false)
        @implementation.respond_to?(symbol, include_priv)
      end

      # Proxy call to underlying implementation when it responds to the
      # method; otherwise defer to the default behavior.
      def method_missing(method, *args, &block)
        return super unless respond_to_missing?(method)
        @implementation.send(method, *args, &block)
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ require 'faraday'
2
+ require 'nokogiri'
3
+ require 'uri'
4
+ require 'newspaper_works/ingest/from_command'
5
+ require 'newspaper_works/ingest/base_publication_info'
6
+ require 'newspaper_works/ingest/chronam_publication_info'
7
+ require 'newspaper_works/ingest/lc_publication_info'
8
+ require 'newspaper_works/ingest/publication_info'
9
+ require 'newspaper_works/ingest/pub_finder'
10
+ require 'newspaper_works/ingest/pdf_images'
11
+ require 'newspaper_works/ingest/named_issue_metadata'
12
+ require 'newspaper_works/ingest/path_enumeration'
13
+ require 'newspaper_works/ingest/pdf_issue'
14
+ require 'newspaper_works/ingest/pdf_issues'
15
+ require 'newspaper_works/ingest/batch_ingest_helper'
16
+ require 'newspaper_works/ingest/batch_issue_ingester'
17
+ require 'newspaper_works/ingest/pdf_pages'
18
+ require 'newspaper_works/ingest/issue_images'
19
+ require 'newspaper_works/ingest/page_image'
20
+ require 'newspaper_works/ingest/image_ingest_issues'
21
+ require 'newspaper_works/ingest/base_ingest'
22
+ require 'newspaper_works/ingest/ndnp'
23
+ require 'newspaper_works/ingest/newspaper_page_ingest'
24
+ require 'newspaper_works/ingest/newspaper_issue_ingest'
25
+
26
module NewspaperWorks
  # Module for Ingest adapters that import files into model objects
  module Ingest
    # Get Geonames URI for closest place match
    # Requires Qa::Authorities::Geonames.username is set, likely via
    #   `Hyrax.config.geonames_username=` setter in
    #   config/initializers/hyrax.rb of consuming app.
    # @param place_name [String] Name of place as human-readable text
    # @return [String, NilClass] URI to Geonames RDF, or nil when no
    #   username is configured, the place name is blank, or no match
    #   is found
    def self.geonames_place_uri(place_name)
      username = Qa::Authorities::Geonames.username
      return if username.nil? || username.empty?
      url = geonames_search_url(place_name, username)
      return if url.nil?
      resp = NewspaperWorks::ResourceFetcher.get url
      doc = Nokogiri.XML(resp['body'])
      geonames_id = doc.xpath('//geonames/geoname[1]/geonameId').first
      return if geonames_id.nil?
      "http://sws.geonames.org/#{geonames_id.text}/"
    end

    # Build the Geonames search URL for a place name: periods are removed,
    # anything from the first '[' or '(' onward is dropped, and the query
    # string is form-encoded (`URI.encode`, used previously, was removed
    # in Ruby 3.0 and left the username unencoded).
    # @param place_name [String] Name of place as human-readable text
    # @param username [String] Geonames API username
    # @return [String, NilClass] search URL, or nil if nothing remains of
    #   the place name after cleanup
    def self.geonames_search_url(place_name, username)
      cleaned = place_name.to_s.delete('.').split(/[\[\(]/).first.to_s.strip
      return if cleaned.empty?
      geo_qs = URI.encode_www_form(q: cleaned, username: username)
      "http://api.geonames.org/search?#{geo_qs}"
    end

    # Normalize publication title from catalog data
    #   Presently strips surrounding whitespace and trailing period(s);
    #   anchored at end of string, so only the end of the title is touched.
    # @param title [String]
    # @return [String] normalized title
    def self.normalize_title(title)
      title.strip.sub(/[.]+\z/, '')
    end

    # Get publication metadata from LC catalog MODS data, if available,
    #   and from ChronAm, as a fallback.
    # @param lccn [String] Library of Congress Control number for publication
    # @return [NewspaperWorks::Ingest::PublicationInfo] proxy to metadata
    #   source, an object for accessors for publication fields.
    def self.publication_metadata(lccn)
      PublicationInfo.new(lccn)
    end

    # Resolve an admin set from an object, an id, or nothing.
    # @param admin_set [AdminSet, String, NilClass] admin set, its id, or
    #   nil for the default admin set
    # @return [AdminSet]
    # @raise re-raises the lookup error unless the default admin set was
    #   requested, in which case the default is found or created
    def self.find_admin_set(admin_set = nil)
      return admin_set if admin_set.is_a?(AdminSet)
      admin_set = AdminSet::DEFAULT_ID if admin_set.nil?
      begin
        AdminSet.find(admin_set)
      rescue StandardError
        # only create if default admin set
        raise unless admin_set == AdminSet::DEFAULT_ID
        AdminSet.find(AdminSet.find_or_create_default_admin_set_id)
      end
    end

    # Assign depositor, admin set, visibility, resource type, timestamps
    # and active state to a work.
    # @param work [Object] work object to modify in place
    # @param opts [Hash] optional :email (depositor), :admin_set,
    #   :visibility (defaults to 'open')
    def self.assign_administrative_metadata(work, opts = {})
      # block form: the batch user is only looked up when no email is given
      work.depositor = opts.fetch(:email) { User.batch_user.user_key }
      work.admin_set = find_admin_set(opts.fetch(:admin_set, nil))
      work.visibility = opts.fetch(:visibility, 'open')
      work.resource_type = ['Newspapers']
      work.date_modified ||= Hyrax::TimeService.time_in_utc
      work.date_uploaded ||= work.date_modified
      work.state = RDF::URI(
        'http://fedora.info/definitions/1/0/access/ObjState#active'
      )
    end
  end
end
@@ -0,0 +1,111 @@
1
+ require 'open3'
2
+ require 'tmpdir'
3
+
4
module NewspaperWorks
  # Adapter class composes a PDF derivative for issue, if it requires one.
  class IssuePDFComposer
    attr_accessor :issue, :page_pdfs

    CMD_BASE = "gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite".freeze

    # @param issue [NewspaperIssue] adapts issue work object
    def initialize(issue)
      @issue = issue
      # paths to page PDFs
      @page_pdfs = []
    end

    # Compose and attach a multi-page issue PDF from page PDFs, unless the
    # issue already has a PDF derivative.
    # @raise [NewspaperWorks::PagesNotReady] if any page PDF is invalid or
    #   not (yet) present, e.g. still waiting on an async job
    # @raise [NewspaperWorks::DataError] if Ghostscript fails or produces
    #   an invalid PDF
    def compose
      # we will not step on any existing PDF
      return if issue_pdf_exists?
      @page_pdfs = validated_page_pdfs
      compose_from_pages
    end

    # Merge all paths in @page_pdfs into a single output document with
    # Ghostscript, validate the result and attach it to the issue.
    # @raise [NewspaperWorks::DataError] on Ghostscript failure or if the
    #   generated PDF does not validate
    def compose_from_pages
      outfile = File.join(Dir.mktmpdir, output_filename)
      # capture3 drains stdout/stderr while waiting for exit, so the child
      # cannot block on a full pipe before its status is read.
      _stdout, stderr, status = Open3.capture3(*ghostscript_argv(outfile))
      raise NewspaperWorks::DataError, "Ghostscript Error: \n#{stderr}" unless status.success?
      # at this point, something should exist and validate at path `outfile`:
      raise NewspaperWorks::DataError, "Generated PDF invalid" unless validate_pdf(outfile)
      # Assign for attachment to issue, commit:
      attach_to_issue(outfile)
    end

    # @return [String] file name for the composed issue PDF
    def output_filename
      "#{@issue.id}_full-issue.pdf"
    end

    # Validate PDF with poppler `pdfinfo` command, which will detect
    #   error conditions in cases like truncated PDF, and only in those
    #   error conditions will write to stderr.
    # @param path [String] path to PDF file
    # @return [Boolean] true or false
    def validate_pdf(path)
      return false if path.nil? || !File.exist?(path)
      return false if File.size(path).zero?
      # argv form (no shell): safe for paths with spaces/metacharacters
      _stdout, stderr, _status = Open3.capture3('pdfinfo', path.to_s)
      # only zero bytes stderr output from `pdfinfo` considered valid PDF:
      stderr.size.zero?
    end

    private

    # @param outfile [String] path of merged PDF to write
    # @return [Array<String>] Ghostscript argv: CMD_BASE words, the output
    #   option, then each page PDF path as its own argument
    def ghostscript_argv(outfile)
      CMD_BASE.split + ["-sOutputFile=#{outfile}"] + @page_pdfs.map(&:to_s)
    end

    # @return [Array] list of paths to page PDFs, in page order
    # @raise [NewspaperWorks::PagesNotReady] if any page has invalid
    #   or non-ready PDF source.
    def validated_page_pdfs
      issue.pages.to_a.each_with_index.map do |page, idx|
        path = derivatives_of(page).path('pdf')
        unless validate_pdf(path)
          raise NewspaperWorks::PagesNotReady,
                "Page PDFs not ready for issue "\
                "(Issue id: #{issue.id}, Page index: #{idx})"
        end
        path
      end
    end

    # @return [Boolean] whether the issue already has a PDF derivative
    def issue_pdf_exists?
      derivatives_of(@issue).exist?('pdf')
    end

    # @return [NewspaperWorks::Data::WorkDerivatives] derivatives adapter
    def derivatives_of(work)
      NewspaperWorks::Data::WorkDerivatives.of(work)
    end

    # Make sure the system temp directory is among Hyrax's whitelisted
    # ingest directories.
    def ensure_whitelist
      whitelist = Hyrax.config.whitelisted_ingest_dirs
      whitelist.push(Dir.tmpdir) unless whitelist.include?(Dir.tmpdir)
    end

    # @param path [String] path to composed PDF
    def attach_to_issue(path)
      ensure_whitelist
      # We rely upon WorkFiles to create fileset, and by consequence of
      #   running primary file attachment through actor stack,
      #   visibility of the FileSet is copied from the work:
      attachment = NewspaperWorks::Data::WorkFiles.of(@issue)
      attachment.assign(path)
      attachment.commit!
    end
  end
end