newspaper_works 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (461) hide show
  1. checksums.yaml +7 -0
  2. data/.fcrepo_wrapper +4 -0
  3. data/.gitignore +43 -0
  4. data/.rubocop.yml +143 -0
  5. data/.solr_wrapper +8 -0
  6. data/.travis.yml +50 -0
  7. data/Gemfile +47 -0
  8. data/LICENSE +203 -0
  9. data/README.md +159 -0
  10. data/Rakefile +38 -0
  11. data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
  12. data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
  13. data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
  14. data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
  15. data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
  16. data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
  17. data/app/assets/config/newspaper_works_manifest.js +2 -0
  18. data/app/assets/images/newspaper_works/.keep +0 -0
  19. data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
  20. data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
  21. data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
  22. data/app/assets/javascripts/newspaper_works.js +4 -0
  23. data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
  24. data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
  25. data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
  26. data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
  27. data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
  28. data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
  29. data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
  30. data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
  31. data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
  32. data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
  33. data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
  34. data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
  35. data/app/forms/hyrax/newspaper_article_form.rb +11 -0
  36. data/app/forms/hyrax/newspaper_container_form.rb +11 -0
  37. data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
  38. data/app/forms/hyrax/newspaper_page_form.rb +15 -0
  39. data/app/forms/hyrax/newspaper_title_form.rb +12 -0
  40. data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
  41. data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
  42. data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
  43. data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
  44. data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
  45. data/app/helpers/newspaper_works/application_helper.rb +5 -0
  46. data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
  47. data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
  48. data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
  49. data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
  50. data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
  51. data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
  52. data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
  53. data/app/indexers/newspaper_article_indexer.rb +16 -0
  54. data/app/indexers/newspaper_container_indexer.rb +18 -0
  55. data/app/indexers/newspaper_issue_indexer.rb +26 -0
  56. data/app/indexers/newspaper_page_indexer.rb +9 -0
  57. data/app/indexers/newspaper_title_indexer.rb +19 -0
  58. data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
  59. data/app/jobs/newspaper_works/application_job.rb +4 -0
  60. data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
  61. data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
  62. data/app/mailers/newspaper_works/application_mailer.rb +8 -0
  63. data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
  64. data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
  65. data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
  66. data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
  67. data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
  68. data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
  69. data/app/models/file_set.rb +10 -0
  70. data/app/models/newspaper_article.rb +158 -0
  71. data/app/models/newspaper_container.rb +86 -0
  72. data/app/models/newspaper_issue.rb +115 -0
  73. data/app/models/newspaper_page.rb +70 -0
  74. data/app/models/newspaper_title.rb +111 -0
  75. data/app/models/newspaper_works/application_record.rb +6 -0
  76. data/app/models/newspaper_works/derivative_attachment.rb +8 -0
  77. data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
  78. data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
  79. data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
  80. data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
  81. data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
  82. data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
  83. data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
  84. data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
  85. data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
  86. data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
  87. data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
  88. data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
  89. data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
  90. data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
  91. data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
  92. data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
  93. data/app/services/hyrax/article_genre_service.rb +9 -0
  94. data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
  95. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
  96. data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
  97. data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
  98. data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
  99. data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
  100. data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
  101. data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
  102. data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
  103. data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
  104. data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
  105. data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
  106. data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
  107. data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
  108. data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
  109. data/app/views/catalog/_snippets_more.html.erb +16 -0
  110. data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
  111. data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
  112. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  113. data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
  114. data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
  115. data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
  116. data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
  117. data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
  118. data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
  119. data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
  120. data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
  121. data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
  122. data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
  123. data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
  124. data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
  125. data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
  126. data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
  127. data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
  128. data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
  129. data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
  130. data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
  131. data/app/views/newspaper_works/base/_show.html.erb +45 -0
  132. data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
  133. data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
  134. data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
  135. data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
  136. data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
  137. data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
  138. data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
  139. data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
  140. data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
  141. data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
  142. data/app/views/records/edit_fields/_genre.html.erb +4 -0
  143. data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
  144. data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
  145. data/bin/rails +13 -0
  146. data/config/fcrepo_wrapper_test.yml +5 -0
  147. data/config/initializers/assets.rb +2 -0
  148. data/config/locales/newspaper_article.de.yml +12 -0
  149. data/config/locales/newspaper_article.en.yml +12 -0
  150. data/config/locales/newspaper_article.es.yml +12 -0
  151. data/config/locales/newspaper_article.fr.yml +12 -0
  152. data/config/locales/newspaper_article.it.yml +12 -0
  153. data/config/locales/newspaper_article.pt-BR.yml +12 -0
  154. data/config/locales/newspaper_article.zh.yml +12 -0
  155. data/config/locales/newspaper_container.de.yml +8 -0
  156. data/config/locales/newspaper_container.en.yml +8 -0
  157. data/config/locales/newspaper_container.es.yml +8 -0
  158. data/config/locales/newspaper_container.fr.yml +8 -0
  159. data/config/locales/newspaper_container.it.yml +8 -0
  160. data/config/locales/newspaper_container.pt-BR.yml +8 -0
  161. data/config/locales/newspaper_container.zh.yml +8 -0
  162. data/config/locales/newspaper_issue.de.yml +8 -0
  163. data/config/locales/newspaper_issue.en.yml +8 -0
  164. data/config/locales/newspaper_issue.es.yml +8 -0
  165. data/config/locales/newspaper_issue.fr.yml +8 -0
  166. data/config/locales/newspaper_issue.it.yml +8 -0
  167. data/config/locales/newspaper_issue.pt-BR.yml +8 -0
  168. data/config/locales/newspaper_issue.zh.yml +8 -0
  169. data/config/locales/newspaper_page.de.yml +15 -0
  170. data/config/locales/newspaper_page.en.yml +15 -0
  171. data/config/locales/newspaper_page.es.yml +15 -0
  172. data/config/locales/newspaper_page.fr.yml +15 -0
  173. data/config/locales/newspaper_page.it.yml +15 -0
  174. data/config/locales/newspaper_page.pt-BR.yml +15 -0
  175. data/config/locales/newspaper_page.zh.yml +15 -0
  176. data/config/locales/newspaper_title.de.yml +8 -0
  177. data/config/locales/newspaper_title.en.yml +8 -0
  178. data/config/locales/newspaper_title.es.yml +8 -0
  179. data/config/locales/newspaper_title.fr.yml +8 -0
  180. data/config/locales/newspaper_title.it.yml +8 -0
  181. data/config/locales/newspaper_title.pt-BR.yml +8 -0
  182. data/config/locales/newspaper_title.zh.yml +8 -0
  183. data/config/locales/newspaper_works.de.yml +50 -0
  184. data/config/locales/newspaper_works.en.yml +52 -0
  185. data/config/locales/newspaper_works.es.yml +52 -0
  186. data/config/locales/newspaper_works.fr.yml +52 -0
  187. data/config/locales/newspaper_works.it.yml +52 -0
  188. data/config/locales/newspaper_works.pt-BR.yml +52 -0
  189. data/config/locales/newspaper_works.zh.yml +52 -0
  190. data/config/routes.rb +9 -0
  191. data/config/solr_wrapper_test.yml +9 -0
  192. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  193. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  194. data/config/test-fixture/solr-config/elevate.xml +36 -0
  195. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  196. data/config/test-fixture/solr-config/protwords.txt +21 -0
  197. data/config/test-fixture/solr-config/schema.xml +366 -0
  198. data/config/test-fixture/solr-config/scripts.conf +24 -0
  199. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  200. data/config/test-fixture/solr-config/spellings.txt +2 -0
  201. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  202. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  203. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  204. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  205. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  206. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  207. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  208. data/config/vendor/imagemagick-6-policy.xml +76 -0
  209. data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
  210. data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
  211. data/lib/generators/newspaper_works/assets_generator.rb +29 -0
  212. data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
  213. data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
  214. data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
  215. data/lib/generators/newspaper_works/install_generator.rb +97 -0
  216. data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
  217. data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
  218. data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
  219. data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
  220. data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
  221. data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
  222. data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
  223. data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
  224. data/lib/newspaper_works/configuration.rb +14 -0
  225. data/lib/newspaper_works/data/fileset_helper.rb +25 -0
  226. data/lib/newspaper_works/data/path_helper.rb +40 -0
  227. data/lib/newspaper_works/data/work_derivatives.rb +314 -0
  228. data/lib/newspaper_works/data/work_file.rb +92 -0
  229. data/lib/newspaper_works/data/work_files.rb +181 -0
  230. data/lib/newspaper_works/data.rb +35 -0
  231. data/lib/newspaper_works/engine.rb +42 -0
  232. data/lib/newspaper_works/errors.rb +14 -0
  233. data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
  234. data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
  235. data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
  236. data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
  237. data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
  238. data/lib/newspaper_works/ingest/from_command.rb +52 -0
  239. data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
  240. data/lib/newspaper_works/ingest/issue_images.rb +51 -0
  241. data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
  242. data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
  243. data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
  244. data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
  245. data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
  246. data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
  247. data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
  248. data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
  249. data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
  250. data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
  251. data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
  252. data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
  253. data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
  254. data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
  255. data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
  256. data/lib/newspaper_works/ingest/ndnp.rb +21 -0
  257. data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
  258. data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
  259. data/lib/newspaper_works/ingest/page_image.rb +52 -0
  260. data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
  261. data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
  262. data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
  263. data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
  264. data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
  265. data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
  266. data/lib/newspaper_works/ingest/publication_info.rb +44 -0
  267. data/lib/newspaper_works/ingest.rb +90 -0
  268. data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
  269. data/lib/newspaper_works/logging.rb +54 -0
  270. data/lib/newspaper_works/page_finder.rb +62 -0
  271. data/lib/newspaper_works/resource_fetcher.rb +78 -0
  272. data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
  273. data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
  274. data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
  275. data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
  276. data/lib/newspaper_works/text_extraction.rb +10 -0
  277. data/lib/newspaper_works/version.rb +3 -0
  278. data/lib/newspaper_works.rb +19 -0
  279. data/lib/tasks/newspaper_works_tasks.rake +39 -0
  280. data/newspaper_works.gemspec +49 -0
  281. data/spec/.keep.txt +1 -0
  282. data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
  283. data/spec/controllers/catalog_controller_spec.rb +63 -0
  284. data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
  285. data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
  286. data/spec/factories/ability.rb +6 -0
  287. data/spec/factories/newspaper_issue.rb +7 -0
  288. data/spec/factories/newspaper_issue_ingest.rb +6 -0
  289. data/spec/factories/newspaper_page.rb +7 -0
  290. data/spec/factories/newspaper_page_ingest.rb +6 -0
  291. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  292. data/spec/factories/newspaper_title.rb +8 -0
  293. data/spec/factories/uploaded_pdf_file.rb +9 -0
  294. data/spec/factories/user.rb +13 -0
  295. data/spec/features/front_pages_for_title_spec.rb +19 -0
  296. data/spec/features/newspaper_title_search_spec.rb +30 -0
  297. data/spec/features/newspapers_search_spec.rb +49 -0
  298. data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
  299. data/spec/features_shared.rb +71 -0
  300. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  301. data/spec/fixtures/files/4.1.07.tiff +0 -0
  302. data/spec/fixtures/files/README.md +7 -0
  303. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  304. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  305. data/spec/fixtures/files/credits.md +16 -0
  306. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  307. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  308. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  309. data/spec/fixtures/files/minimal-alto.xml +31 -0
  310. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  311. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  312. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  313. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  314. data/spec/fixtures/files/ocr_alto.xml +202 -0
  315. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  316. data/spec/fixtures/files/ocr_color.tiff +0 -0
  317. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  318. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  319. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  320. data/spec/fixtures/files/page1.tiff +0 -0
  321. data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
  322. data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
  323. data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
  324. data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
  325. data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
  326. data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
  327. data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
  328. data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
  329. data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
  330. data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
  331. data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
  332. data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
  333. data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
  334. data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
  335. data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
  336. data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
  337. data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
  338. data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
  339. data/spec/fixtures/files/resource_mocks/urls.json +82 -0
  340. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  341. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  342. data/spec/fixtures/files/thumbnail.jpg +0 -0
  343. data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
  344. data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
  345. data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
  346. data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
  347. data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
  348. data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
  349. data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
  350. data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
  351. data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
  352. data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
  353. data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
  354. data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
  355. data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
  356. data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
  357. data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
  358. data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
  359. data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
  360. data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
  361. data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
  362. data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
  363. data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
  364. data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
  365. data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
  366. data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
  367. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
  368. data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
  369. data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
  370. data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
  371. data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
  372. data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
  373. data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
  374. data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
  375. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
  376. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
  377. data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
  378. data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
  379. data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
  380. data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
  381. data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
  382. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
  383. data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
  384. data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
  385. data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
  386. data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
  387. data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
  388. data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
  389. data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
  390. data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
  391. data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
  392. data/spec/lib/newspaper_works/logging_spec.rb +53 -0
  393. data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
  394. data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
  395. data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
  396. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
  397. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
  398. data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
  399. data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
  400. data/spec/misc_shared.rb +109 -0
  401. data/spec/model_shared.rb +134 -0
  402. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
  403. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
  404. data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
  405. data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
  406. data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
  407. data/spec/models/newspaper_article_spec.rb +73 -0
  408. data/spec/models/newspaper_container_spec.rb +111 -0
  409. data/spec/models/newspaper_issue_spec.rb +91 -0
  410. data/spec/models/newspaper_page_spec.rb +44 -0
  411. data/spec/models/newspaper_title_spec.rb +116 -0
  412. data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
  413. data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
  414. data/spec/models/solr_document_spec.rb +14 -0
  415. data/spec/ndnp_shared.rb +48 -0
  416. data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
  417. data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
  418. data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
  419. data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
  420. data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
  421. data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
  422. data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
  423. data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
  424. data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
  425. data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
  426. data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
  427. data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
  428. data/spec/routing/route_spec.rb +52 -0
  429. data/spec/search_builders/custom_search_builder_spec.rb +34 -0
  430. data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
  431. data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
  432. data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
  433. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
  434. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
  435. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
  436. data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
  437. data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
  438. data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
  439. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
  440. data/spec/spec_helper.rb +261 -0
  441. data/spec/support/controller_level_helpers.rb +28 -0
  442. data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
  443. data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
  444. data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
  445. data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
  446. data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
  447. data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
  448. data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
  449. data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
  450. data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
  451. data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
  452. data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
  453. data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
  454. data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
  455. data/tasks/newspaperworks_dev.rake +26 -0
  456. data/test/integration/navigation_test.rb +7 -0
  457. data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
  458. data/test/newspaper_works_test.rb +7 -0
  459. data/test/test_helper.rb +17 -0
  460. data/tmp/.keep +0 -0
  461. metadata +1037 -0
@@ -0,0 +1,52 @@
1
+ module NewspaperWorks
2
+ module Ingest
3
+ # Represents TIFF/JP2 page, access to file, page-numbering metadata
4
+ class PageImage
5
+ attr_accessor :path, :issue, :sequence
6
+
7
+ delegate :lccn, to: :issue
8
+
9
# Wrap one page image file belonging to an issue.
# @param path [String] path to the page image; checked by validate_path
# @param issue [NewspaperWorks::Ingest::IssueImages] issue owning the page
# @param sequence [#to_i] page sequence number, coerced to Integer
# @raise [ArgumentError] when validate_path rejects the path
def initialize(path, issue, sequence)
  @path = path
  # validate before storing anything else, so a bad path fails fast:
  validate_path
  @issue, @sequence = issue, sequence.to_i
end
18
+
19
# Page number inferred from image filename, or nil, presuming that:
#   - The page number follows the actual word "page" (case-insensitive)
#     in filename, possibly separated by a dash or underscore.
#   - The page number is terminated by the period-plus-file-extension.
#   - Both of the above can be determined by regular expression match.
#   - Extraneous leading information in filename (e.g. datestamp) will
#     be ignored.
#   - Only the file's basename is examined, so a directory name that
#     happens to contain "page" cannot leak into the result.
#   - Examples:
#     - 'Page1.tiff'
#     - '2019091801-page_1.jp2'
#     - 'page_C2.tiff'
# @return [String, NilClass] page number string, or nil if indecipherable
def named_page_number
  filename = File.basename(path)
  # capture group 1 is whatever sits between "page[_-]" and the first dot
  filename[/page[_-]?([^.]+)[.]/i, 1]
end
36
+
37
# Page number for display: the number parsed from the filename when
# one is found, otherwise the sequence number as a string.
# @return [String]
def page_number
  from_filename = named_page_number
  return from_filename if from_filename
  @sequence.to_s
end
40
+
41
# Title for the page, built from the first title of the issue plus
# the page number.
# @return [Array<String>] single-element array
def title
  issue_title = @issue.title.first
  ["#{issue_title}: Page #{page_number}"]
end
44
+
45
# Expect path to be a regular file that exists.
# @return [NilClass]
# @raise [ArgumentError] if path is missing or is not a regular file;
#   the message names the offending path so failed ingests are traceable.
def validate_path
  raise ArgumentError, "Path does not exist: #{path}" unless File.exist?(path)
  # File.ftype distinguishes directories, links, devices, etc.
  return if File.ftype(path) == 'file'
  raise ArgumentError, "Path is not a regular file: #{path}"
end
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,52 @@
1
+ module NewspaperWorks
2
+ module Ingest
3
+ # Provides enumeration of path keys to object values, where:
4
+ # - Consuming class:
5
+ # - Defines a `paths` method returning array of paths.
6
+ # - Defines an `info` method that returns an object for a path.
7
+ # - Also mixes in Enumerable
8
+ module PathEnumeration
9
+ delegate :size, :include?, to: :_paths
10
+
11
# Indirection over the consuming class's `paths` method, so that
# delegation and aliases in this mixin have a method defined here.
# @return [Array] whatever `paths` returns
def _paths
  paths
end
14
+
15
# Indirection over the consuming class's `info` method for one path.
# @param path [String] path key
# @return [Object] whatever `info(path)` returns
def _info(path)
  info(path)
end
18
+
19
# Enumerate [path, info] pairs.
# @yield [Array] two-element array of path and the object for that path
# @return [Enumerator] when called without a block
def each
  return to_enum(:each) unless block_given?
  paths.each { |key| yield [key, info(key)] }
end
25
+
26
# Enumerate path keys.
# @yield [String] each path
# @return [Enumerator] when called without a block
def each_key
  # the `return` matters: without it a blockless call falls through to
  # the yield below and raises LocalJumpError
  return enum_for(:each_key) unless block_given?
  paths.each { |path| yield path }
end
30
+
31
# Enumerate the objects returned by `info` for each path.
# @yield [Object] info object for each path
# @return [Enumerator] when called without a block
def each_value
  return to_enum(:each_value) unless block_given?
  paths.each { |key| yield info(key) }
end
37
+
38
# All info objects, in path order.
# @return [Array]
def values
  collected = each_value
  collected.to_a
end
41
+
42
# All [path, info] pairs, in path order.
# @return [Array<Array>]
def entries
  pairs = each
  pairs.to_a
end
45
+
46
+ alias each_pair each
47
+ alias keys _paths
48
+ alias has_key? include?
49
+ alias [] _info
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,85 @@
1
+ require 'open3'
2
+ require 'mini_magick'
3
+
4
module NewspaperWorks
  module Ingest
    # PdfImages uses poppler 0.19+ pdfimages command to extract image
    #   listing metadata from PDF files.
    #   For dpi extraction, falls back to calculating using MiniMagick,
    #   if necessary.
    class PdfImages
      # class constant column numbers (fields of a `pdfimages -list` row)
      COL_WIDTH = 3
      COL_HEIGHT = 4
      COL_COLOR = 5
      COL_CHANNELS = 6
      COL_BITS = 7
      # only poppler 0.25+ has this column in output:
      COL_XPPI = 12

      # @param path [String] path to PDF file
      def initialize(path)
        @path = path
        # argv form, executed without a shell, so that paths containing
        # spaces or shell metacharacters reach pdfimages intact:
        @cmd = ['pdfimages', '-list', path.to_s]
        @output = nil
        @entries = nil
      end

      # Run the listing command (once) and return its data rows.
      # @return [Array<String>] output lines after the two header lines;
      #   empty when the command printed fewer than three lines.
      def process
        # call just once
        if @output.nil?
          # capture3 drains stdout and stderr together, so the child
          # cannot block on a full, unread stderr pipe:
          stdout, _stderr, _status = Open3.capture3(*@cmd)
          @output = stdout.split("\n")
        end
        # drop(2) yields [] for short output, where slice would yield nil
        @output.drop(2)
      end

      # @return [Array<Array<String>>] whitespace-separated fields per row
      def entries
        @entries ||= process.map(&:split)
      end

      # @param i [Integer] column index
      # @yield [String] optional transform applied to each column value
      # @return [Array] column values, transformed when a block is given
      def selectcolumn(i, &block)
        column = entries.map { |row| row[i] }
        block_given? ? column.map(&block) : column
      end

      # @return [Integer, NilClass] largest image width, nil if no images
      def width
        selectcolumn(COL_WIDTH, &:to_i).max
      end

      # @return [Integer, NilClass] largest image height, nil if no images
      def height
        selectcolumn(COL_HEIGHT, &:to_i).max
      end

      # @return [Array] [description, max channels, max bits per channel]
      def color
        # desc is either 'gray', 'cmyk', 'rgb', but 1-bit gray is black/white
        #   so caller may want all of this information, and in case of
        #   mixed color spaces across images, this returns maximum
        desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
        channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
        bits = entries.map { |e| e[COL_BITS].to_i }.max
        [desc, channels, bits]
      end

      # @return [Integer, NilClass] pixels per inch; nil when the PDF lists
      #   no images (consistent with width/height in that case)
      def ppi
        first_row = entries.first
        return nil if first_row.nil?
        if first_row.size <= COL_XPPI
          # poppler < 0.25: no x-ppi column, so derive from the width
          # MiniMagick reports for the PDF (treated as points) and the
          # pixel width of the largest image
          pdf = MiniMagick::Image.open(@path)
          return (72 * width / pdf.width).to_i
        end
        # with poppler 0.25+, pdfimages just gives us this:
        selectcolumn(COL_XPPI, &:to_i).max
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'date'
2
+
3
+ module NewspaperWorks
4
+ module Ingest
5
+ class PDFIssue
6
+ attr_accessor :path, :publication
7
+
8
+ # most accessors for issue/edition metadata, publication metadata
9
+ # provided by including this mixin:
10
+ include NewspaperWorks::Ingest::NamedIssueMetadata
11
+
12
# @param path [String] path to the issue PDF; checked by validate_path
# @param publication [NewspaperWorks::Ingest::PublicationInfo]
#   publication metadata for the issue
def initialize(path, publication)
  @path = path
  # path is checked before anything else is stored:
  validate_path
  @publication = publication
end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,39 @@
1
+ require 'find'
2
+
3
+ module NewspaperWorks
4
+ module Ingest
5
+ class PDFIssues
6
+ include Enumerable
7
+ include NewspaperWorks::Ingest::PathEnumeration
8
+
9
+ attr_accessor :path, :publication, :pdf_paths
10
+
11
+ alias paths pdf_paths
12
+
13
# @param path [String] directory (or file) searched for PDFs
# @param publication [NewspaperWorks::Ingest::PublicationInfo]
#   publication metadata shared by all issues found
def initialize(path, publication)
  @path, @publication = path, publication
  # enumerate the PDF files up front:
  @pdf_paths = valid_pdfs(path)
end
19
+
20
# Recursively collect PDF file paths beneath a path.
# The extension check is case-insensitive, so files named `*.PDF`
# are included rather than silently skipped.
# @param path [String] root path to search
# @return [Array<String>] paths of non-directory entries ending in .pdf,
#   in the order Find yields them
def valid_pdfs(path)
  Find.find(path).select do |candidate|
    !File.directory?(candidate) && candidate.downcase.end_with?('.pdf')
  end
end
29
+
30
# LCCN as reported by the publication metadata object.
# @return [Object] value of `lccn` on the publication
def lccn
  @publication.lccn
end
33
+
34
# Build the issue object for one PDF path.
# @param path [String] path to issue PDF
# @return [NewspaperWorks::Ingest::PDFIssue]
def info(path)
  issue_class = NewspaperWorks::Ingest::PDFIssue
  issue_class.new(path, @publication)
end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,114 @@
1
+ require 'open3'
2
+ require 'securerandom'
3
+ require 'tmpdir'
4
+
5
module NewspaperWorks
  module Ingest
    # Enumerates TIFF page images rendered from a PDF by Ghostscript.
    # External commands are run in argv form (no shell), so PDF paths
    # containing spaces or shell metacharacters are handled correctly.
    class PdfPages
      include Enumerable

      # @param path [String] path to source PDF
      def initialize(path)
        @baseid = SecureRandom.uuid
        @pdfpath = path
        @info = nil
        @entries = nil
        @tmpdir = nil
        @size = nil
        @pagecount = nil
        @pdftext = nil
      end

      # @return [PdfImages] image-listing metadata for the PDF (memoized)
      def pdfinfo
        @info ||= PdfImages.new(@pdfpath)
      end

      # @return [String] temporary directory for page output (memoized)
      def tmpdir
        @tmpdir ||= Dir.mktmpdir
      end

      # @param channels [Integer] number of color channels
      # @param bpc [Integer] bits per channel
      # @return [String] Ghostscript color TIFF device name
      def colordevice(channels, bpc)
        bits = bpc * channels
        # will be either 8bpc/16bpc color TIFF,
        #   with any CMYK source transformed to 8bpc RGB
        bits = 24 unless [24, 48].include? bits
        "tiff#{bits}nc"
      end

      # @return [String] Ghostscript output device suited to the PDF's
      #   color description
      def gsdevice
        color, channels, bpc = pdfinfo.color
        if color == 'gray'
          # CCITT Group 4 Black and White, if applicable:
          return 'tiffg4' if bpc == 1
          # 8 Bit Grayscale, if applicable:
          return 'tiffgray' if bpc > 1
        end
        # otherwise color:
        colordevice(channels, bpc)
      end

      # @return [String] text of the PDF as written by Ghostscript txtwrite
      def gstext
        stdout, _stderr, _status = Open3.capture3(
          'gs', '-q', '-dNOPAUSE', '-dBATCH', '-sDEVICE=txtwrite',
          '-sOutputFile=-', '-f', @pdfpath.to_s
        )
        @pdftext = stdout
      end

      # @return [Integer] page count parsed from `pdfinfo` output
      def pagecount
        stdout, _stderr, _status = Open3.capture3('pdfinfo', @pdfpath.to_s)
        pages_e = stdout.split("\n").find { |e| e.start_with?('Pages:') }
        @pagecount = pages_e.split[-1].to_i
      end

      # @return [Boolean] true if PDF has a single 10mp+ image per page
      def looks_scanned
        max_image_px = pdfinfo.width * pdfinfo.height
        single_image_per_page = pdfinfo.entries.length == pagecount
        # single 10mp+ image per page?
        single_image_per_page && max_image_px > 1024 * 1024 * 10
      end

      # @return [Integer] resolution to render pages at
      def ppi
        # 400 dpi for something that does not look like scanned media:
        return 400 unless looks_scanned
        # For scanned media, defer to detected image PPI:
        pdfinfo.ppi
      end

      # ghostscript convert all pages to TIFF
      # @return [Array<String>] expected output filenames, one per page
      #   that Ghostscript reported on stdout
      def gsconvert
        output_base = File.join(tmpdir, "#{@baseid}-page%d.tiff")
        # capture3 drains stderr as well as stdout, so a chatty gs run
        # cannot stall on a full, unread stderr pipe:
        stdout, _stderr, _status = Open3.capture3(
          'gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{gsdevice}",
          '-dTextAlphaBits=4',
          "-sOutputFile=#{output_base}", "-r#{ppi}", '-f', @pdfpath.to_s
        )
        @size = stdout.split("\n").count { |e| e.start_with?('Page ') }
        # Return an array of expected filenames
        (1..@size).map { |n| File.join(tmpdir, "#{@baseid}-page#{n}.tiff") }
      end

      # entries for each page
      # @return [Array<String>] page TIFF paths (conversion runs once)
      def entries
        @entries ||= gsconvert
      end

      # @yield [String] path to each page TIFF
      # @return [Enumerator] when called without a block
      def each(&block)
        return enum_for(:each) unless block_given?
        entries.each(&block)
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
+ require 'newspaper_works/logging'
2
+ require 'newspaper_works/ingest'
3
+
4
+ module NewspaperWorks
5
+ module Ingest
6
+ # mixin for find-or-create of publication, for use by various ingests
7
+ module PubFinder
8
+ include NewspaperWorks::Logging
9
+
10
+ COPY_FIELDS = [
11
+ :title,
12
+ :lccn,
13
+ :oclcnum,
14
+ :issn,
15
+ :place_of_publication,
16
+ :language,
17
+ :preceded_by,
18
+ :succeeded_by
19
+ ].freeze
20
+
21
+ MULTI_VALUED = [
22
+ :title,
23
+ :language,
24
+ :preceded_by,
25
+ :succeeded_by,
26
+ :place_of_publication
27
+ ].freeze
28
+
29
+ WRAPPERS = {
30
+ place_of_publication: Hyrax::ControlledVocabularies::Location
31
+ }.freeze
32
+
33
# Look up an existing publication by LCCN.
# @param lccn [String] Library of Congress Control Number
#   of Publication
# @return [NewspaperTitle, NilClass] publication or nil if not found
def find_publication(lccn)
  matches = NewspaperTitle.where(lccn: lccn)
  matches.first
end
39
+
40
# Copy publication metadata from authority lookup for LCCN onto a
# publication, then settle the title.
# @param publication [NewspaperTitle] receives the field values
# @param metadata [NewspaperWorks::Ingest::PublicationInfo] source
# @param lccn [String] used as title of last resort
# @param title [String, NilClass] locally specified title, if any
def copy_publication_metadata(publication, metadata, lccn, title = nil)
  COPY_FIELDS.each do |field|
    copied = metadata.send(field)
    # nil means the source has nothing for this field; leave it alone
    next if copied.nil?
    # wrapped value, if applicable:
    copied = WRAPPERS[field].new(copied) if WRAPPERS.key?(field)
    # value in array, if applicable:
    copied = [copied] if MULTI_VALUED.include?(field)
    publication.send("#{field}=", copied)
  end
  # prefer locally-specified title to looked-up title:
  publication.title = [title] unless title.nil?
  # final fallback, nothing specified, title mandatory: use LCCN
  publication.title = [lccn] if publication.title.empty?
end
58
+
59
# Create a NewspaperTitle for an LCCN, populated from looked-up
# publication metadata and administrative metadata, and save it.
# @param lccn [String] Library of Congress Control Number
# @param title [String, NilClass] locally specified title, if any
# @param opts [Hash] passed to assign_administrative_metadata
# @return [NewspaperTitle] the saved publication
def create_publication(lccn, title = nil, opts = {})
  NewspaperTitle.create.tap do |created|
    looked_up = NewspaperWorks::Ingest::PublicationInfo.new(lccn)
    copy_publication_metadata(created, looked_up, lccn, title)
    # keep the given LCCN when the lookup supplied none:
    created.lccn ||= lccn
    NewspaperWorks::Ingest.assign_administrative_metadata(created, opts)
    created.save!
    write_log(
      "Created NewspaperTitle work #{created.id} for LCCN #{lccn}"
    )
  end
end
71
+
72
# Find the publication for an LCCN, creating it when absent, then
# link the issue to it as a member and save.
# @param issue [NewspaperIssue] issue to link
# @param lccn [String] Library of Congress Control Number
# @param title [String, NilClass] title used if publication is created
# @param opts [Hash] options used if publication is created
def find_or_create_publication_for_issue(issue, lccn, title, opts)
  publication = find_publication(lccn)
  if publication.nil?
    publication = create_publication(lccn, title, opts)
  else
    write_log(
      "Found existing NewspaperTitle #{publication.id}, LCCN #{lccn}"
    )
  end
  publication.members << issue
  publication.save!
  write_log(
    "Linked NewspaperIssue #{issue.id} to "\
    "NewspaperTitle work #{publication.id}"
  )
end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,44 @@
1
+ require 'faraday'
2
+ require 'nokogiri'
3
+ require 'uri'
4
+
5
module NewspaperWorks
  module Ingest
    # Proxy to a publication metadata source for an LCCN. Loads from
    # LCPublicationInfo first and switches to ChronAmPublicationInfo
    # when the former reports itself empty; unknown methods are
    # forwarded to whichever implementation is in place.
    class PublicationInfo
      attr_accessor :implementation, :lccn

      # @param lccn [String] Library of Congress Control Number
      def initialize(lccn)
        @lccn = lccn
        @implementation = nil
        load
      end

      # Replace the implementation with the ChronAm-backed source.
      def load_chronam_fallback
        @implementation = ChronAmPublicationInfo.new(@lccn)
      end

      # Load from the LC source, falling back to ChronAm when empty.
      def load
        primary = LCPublicationInfo.new(@lccn)
        @implementation = primary
        primary.load
        # Empty mods is equivalent to 404 for LCCN in LC Catalog:
        load_chronam_fallback if primary.empty?
      end

      def respond_to_missing?(symbol, include_priv = false)
        @implementation.respond_to?(symbol, include_priv)
      end

      def method_missing(method, *args, &block)
        # anything the implementation cannot answer goes up the chain:
        return super unless respond_to_missing?(method)
        # proxy call to underlying implementation:
        @implementation.send(method, *args, &block)
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ require 'faraday'
2
+ require 'nokogiri'
3
+ require 'uri'
4
+ require 'newspaper_works/ingest/from_command'
5
+ require 'newspaper_works/ingest/base_publication_info'
6
+ require 'newspaper_works/ingest/chronam_publication_info'
7
+ require 'newspaper_works/ingest/lc_publication_info'
8
+ require 'newspaper_works/ingest/publication_info'
9
+ require 'newspaper_works/ingest/pub_finder'
10
+ require 'newspaper_works/ingest/pdf_images'
11
+ require 'newspaper_works/ingest/named_issue_metadata'
12
+ require 'newspaper_works/ingest/path_enumeration'
13
+ require 'newspaper_works/ingest/pdf_issue'
14
+ require 'newspaper_works/ingest/pdf_issues'
15
+ require 'newspaper_works/ingest/batch_ingest_helper'
16
+ require 'newspaper_works/ingest/batch_issue_ingester'
17
+ require 'newspaper_works/ingest/pdf_pages'
18
+ require 'newspaper_works/ingest/issue_images'
19
+ require 'newspaper_works/ingest/page_image'
20
+ require 'newspaper_works/ingest/image_ingest_issues'
21
+ require 'newspaper_works/ingest/base_ingest'
22
+ require 'newspaper_works/ingest/ndnp'
23
+ require 'newspaper_works/ingest/newspaper_page_ingest'
24
+ require 'newspaper_works/ingest/newspaper_issue_ingest'
25
+
26
module NewspaperWorks
  # Module for Ingest adapters that import files into model objects
  module Ingest
    # Get Geonames URI for closest place match
    #   Requires Qa::Authorities::Geonames.username is set, likely via
    #   `Hyrax.config.geonames_username=` setter in
    #   config/initializers/hyrax.rb of consuming app.
    # @param place_name [String] Name of place as human-readable text
    # @return [String, NilClass] URI to Geonames RDF or nil
    def self.geonames_place_uri(place_name)
      username = Qa::Authorities::Geonames.username
      return if username.nil? || username.empty?
      # drop periods and anything from the first bracket or parenthesis on
      place_name = place_name.delete('.').split(/[\[\(]/).first.to_s.strip
      # nothing left to search for:
      return if place_name.empty?
      # encode_www_form escapes both values (URI.encode no longer exists
      # in Ruby 3) and keeps the q, username parameter order:
      geo_qs = URI.encode_www_form(q: place_name, username: username)
      url = "http://api.geonames.org/search?#{geo_qs}"
      resp = NewspaperWorks::ResourceFetcher.get url
      doc = Nokogiri.XML(resp['body'])
      geonames_id = doc.xpath('//geonames/geoname[1]/geonameId').first
      return if geonames_id.nil?
      "http://sws.geonames.org/#{geonames_id.text}/"
    end

    # Normalize publication title from catalog data
    #   Presently strips surrounding whitespace and trailing period(s);
    #   anchored at end of string, so a period ending an interior line of
    #   a multi-line title is left alone.
    # @param title [String]
    # @return [String] normalized title
    def self.normalize_title(title)
      title.strip.sub(/[.]+\z/, '')
    end

    # Get publication metadata from LC catalog MODS data, if available,
    #   and from ChronAm, as a fallback.
    # @param lccn [String] Library of Congress Control number for publication
    # @return [NewspaperWorks::Ingest::PublicationInfo] proxy to metadata
    #   source, an object for accessors for publication fields.
    def self.publication_metadata(lccn)
      PublicationInfo.new(lccn)
    end

    # Resolve an admin set from an object, an id, or nothing (default).
    # @param admin_set [AdminSet, String, NilClass]
    # @return [AdminSet]
    # @raise [StandardError] re-raises lookup failure for non-default ids
    def self.find_admin_set(admin_set = nil)
      return admin_set if admin_set.is_a?(AdminSet)
      admin_set = AdminSet::DEFAULT_ID if admin_set.nil?
      begin
        AdminSet.find(admin_set)
      rescue
        # only create if default admin set
        raise unless admin_set == AdminSet::DEFAULT_ID
        AdminSet.find(AdminSet.find_or_create_default_admin_set_id)
      end
    end

    # Assign depositor, admin set, visibility, resource type, dates and
    # state to a work.
    # @param work [Object] work receiving the metadata
    # @param opts [Hash] optional :email, :admin_set, :visibility
    def self.assign_administrative_metadata(work, opts = {})
      # block form: the batch user is only looked up when no email is given
      work.depositor = opts.fetch(:email) { User.batch_user.user_key }
      work.admin_set = find_admin_set(opts.fetch(:admin_set, nil))
      work.visibility = opts.fetch(:visibility, 'open')
      work.resource_type = ['Newspapers']
      work.date_modified ||= Hyrax::TimeService.time_in_utc
      work.date_uploaded ||= work.date_modified
      work.state = RDF::URI(
        'http://fedora.info/definitions/1/0/access/ObjState#active'
      )
    end
  end
end
@@ -0,0 +1,111 @@
1
+ require 'open3'
2
+ require 'tmpdir'
3
+
4
module NewspaperWorks
  # Adapter class composes a PDF derivative for issue, if it requires one.
  # External commands run in argv form (no shell), so file paths with
  # spaces or shell metacharacters are passed through intact.
  class IssuePDFComposer
    attr_accessor :issue, :page_pdfs

    CMD_BASE = "gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite".freeze

    # @param issue [NewspaperIssue] adapts issue work object
    def initialize(issue)
      @issue = issue
      # paths to page PDFs
      @page_pdfs = []
    end

    # Compose and attach a multi-page issue PDF, unless one exists.
    # @raise [NewspaperWorks::PagesNotReady] if a page PDF is not valid
    # @raise [NewspaperWorks::DataError] if Ghostscript fails or the
    #   generated PDF does not validate
    def compose
      # we will not step on any existing PDF
      return if issue_pdf_exists?
      # we can not compose a multi-page issue PDF if constituent page PDFs
      #   do not exist (yet == not ready, possibly waiting on an async job).
      @page_pdfs = validated_page_pdfs
      # Compose a Ghostscript command to merge all paths in @page_pdfs into
      #   a single output document, execute:
      compose_from_pages
    end

    # Merge @page_pdfs into one PDF with Ghostscript, validate the
    # result, and attach it to the issue.
    # @raise [NewspaperWorks::DataError] on Ghostscript failure or an
    #   invalid generated PDF
    def compose_from_pages
      outfile = File.join(Dir.mktmpdir, output_filename)
      cmd = CMD_BASE.split(' ') + ["-sOutputFile=#{outfile}"] + @page_pdfs.map(&:to_s)
      # capture3 drains both pipes while waiting for exit; waiting on the
      # process first, with output unread, can stall a chatty child.
      _stdout, stderr, status = Open3.capture3(*cmd)
      unless status.success?
        e = "Ghostscript Error: \n#{stderr}"
        raise NewspaperWorks::DataError, e
      end
      # at this point, something should exist and validate at path `outfile`:
      raise NewspaperWorks::DataError, "Generated PDF invalid" unless validate_pdf(outfile)
      # Assign for attachment to issue, commit:
      attach_to_issue(outfile)
    end

    # @return [String] filename for the composed issue PDF
    def output_filename
      "#{@issue.id}_full-issue.pdf"
    end

    # Validate PDF with poppler `pdfinfo` command, which will detect
    #   error conditions in cases like truncated PDF, and only in those
    #   error conditions will write to stderr.
    # @param path [String] path to PDF file
    # @return [Boolean] true or false
    def validate_pdf(path)
      return false if path.nil? || !File.exist?(path)
      return false if File.size(path).zero?
      _stdout, stderr, _status = Open3.capture3('pdfinfo', path.to_s)
      # only zero bytes stderr output from `pdfinfo` considered valid PDF:
      stderr.empty?
    end

    private

      # @return [Array] list of paths to page PDFs, in page order
      # @raise [NewspaperWorks::PagesNotReady] if any page has invalid
      #   or non-ready PDF source.
      def validated_page_pdfs
        # if any page PDF invalid, raise; otherwise collect its path:
        issue.pages.to_a.each_with_index.map do |page, idx|
          e = "Page PDFs not ready for issue "\
            "(Issue id: #{issue.id}, Page index: #{idx})"
          path = derivatives_of(page).path('pdf')
          raise NewspaperWorks::PagesNotReady, e unless validate_pdf(path)
          path
        end
      end

      def issue_pdf_exists?
        derivatives_of(@issue).exist?('pdf')
      end

      def derivatives_of(work)
        NewspaperWorks::Data::WorkDerivatives.of(work)
      end

      # Make sure the system temp dir is among Hyrax's whitelisted ingest
      # directories, since the composed PDF is written beneath it.
      def ensure_whitelist
        whitelist = Hyrax.config.whitelisted_ingest_dirs
        whitelist.push(Dir.tmpdir) unless whitelist.include?(Dir.tmpdir)
      end

      def attach_to_issue(path)
        ensure_whitelist
        # We rely upon WorkFiles to create fileset, and by consequence of
        #   running primary file attachment through actor stack,
        #   visibility of the FileSet is copied from the work:
        attachment = NewspaperWorks::Data::WorkFiles.of(@issue)
        attachment.assign(path)
        attachment.commit!
      end
  end
end