newspaper_works 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (461) hide show
  1. checksums.yaml +7 -0
  2. data/.fcrepo_wrapper +4 -0
  3. data/.gitignore +43 -0
  4. data/.rubocop.yml +143 -0
  5. data/.solr_wrapper +8 -0
  6. data/.travis.yml +50 -0
  7. data/Gemfile +47 -0
  8. data/LICENSE +203 -0
  9. data/README.md +159 -0
  10. data/Rakefile +38 -0
  11. data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
  12. data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
  13. data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
  14. data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
  15. data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
  16. data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
  17. data/app/assets/config/newspaper_works_manifest.js +2 -0
  18. data/app/assets/images/newspaper_works/.keep +0 -0
  19. data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
  20. data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
  21. data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
  22. data/app/assets/javascripts/newspaper_works.js +4 -0
  23. data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
  24. data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
  25. data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
  26. data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
  27. data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
  28. data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
  29. data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
  30. data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
  31. data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
  32. data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
  33. data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
  34. data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
  35. data/app/forms/hyrax/newspaper_article_form.rb +11 -0
  36. data/app/forms/hyrax/newspaper_container_form.rb +11 -0
  37. data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
  38. data/app/forms/hyrax/newspaper_page_form.rb +15 -0
  39. data/app/forms/hyrax/newspaper_title_form.rb +12 -0
  40. data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
  41. data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
  42. data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
  43. data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
  44. data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
  45. data/app/helpers/newspaper_works/application_helper.rb +5 -0
  46. data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
  47. data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
  48. data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
  49. data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
  50. data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
  51. data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
  52. data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
  53. data/app/indexers/newspaper_article_indexer.rb +16 -0
  54. data/app/indexers/newspaper_container_indexer.rb +18 -0
  55. data/app/indexers/newspaper_issue_indexer.rb +26 -0
  56. data/app/indexers/newspaper_page_indexer.rb +9 -0
  57. data/app/indexers/newspaper_title_indexer.rb +19 -0
  58. data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
  59. data/app/jobs/newspaper_works/application_job.rb +4 -0
  60. data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
  61. data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
  62. data/app/mailers/newspaper_works/application_mailer.rb +8 -0
  63. data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
  64. data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
  65. data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
  66. data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
  67. data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
  68. data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
  69. data/app/models/file_set.rb +10 -0
  70. data/app/models/newspaper_article.rb +158 -0
  71. data/app/models/newspaper_container.rb +86 -0
  72. data/app/models/newspaper_issue.rb +115 -0
  73. data/app/models/newspaper_page.rb +70 -0
  74. data/app/models/newspaper_title.rb +111 -0
  75. data/app/models/newspaper_works/application_record.rb +6 -0
  76. data/app/models/newspaper_works/derivative_attachment.rb +8 -0
  77. data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
  78. data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
  79. data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
  80. data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
  81. data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
  82. data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
  83. data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
  84. data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
  85. data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
  86. data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
  87. data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
  88. data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
  89. data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
  90. data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
  91. data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
  92. data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
  93. data/app/services/hyrax/article_genre_service.rb +9 -0
  94. data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
  95. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
  96. data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
  97. data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
  98. data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
  99. data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
  100. data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
  101. data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
  102. data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
  103. data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
  104. data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
  105. data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
  106. data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
  107. data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
  108. data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
  109. data/app/views/catalog/_snippets_more.html.erb +16 -0
  110. data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
  111. data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
  112. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  113. data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
  114. data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
  115. data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
  116. data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
  117. data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
  118. data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
  119. data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
  120. data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
  121. data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
  122. data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
  123. data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
  124. data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
  125. data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
  126. data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
  127. data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
  128. data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
  129. data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
  130. data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
  131. data/app/views/newspaper_works/base/_show.html.erb +45 -0
  132. data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
  133. data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
  134. data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
  135. data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
  136. data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
  137. data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
  138. data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
  139. data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
  140. data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
  141. data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
  142. data/app/views/records/edit_fields/_genre.html.erb +4 -0
  143. data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
  144. data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
  145. data/bin/rails +13 -0
  146. data/config/fcrepo_wrapper_test.yml +5 -0
  147. data/config/initializers/assets.rb +2 -0
  148. data/config/locales/newspaper_article.de.yml +12 -0
  149. data/config/locales/newspaper_article.en.yml +12 -0
  150. data/config/locales/newspaper_article.es.yml +12 -0
  151. data/config/locales/newspaper_article.fr.yml +12 -0
  152. data/config/locales/newspaper_article.it.yml +12 -0
  153. data/config/locales/newspaper_article.pt-BR.yml +12 -0
  154. data/config/locales/newspaper_article.zh.yml +12 -0
  155. data/config/locales/newspaper_container.de.yml +8 -0
  156. data/config/locales/newspaper_container.en.yml +8 -0
  157. data/config/locales/newspaper_container.es.yml +8 -0
  158. data/config/locales/newspaper_container.fr.yml +8 -0
  159. data/config/locales/newspaper_container.it.yml +8 -0
  160. data/config/locales/newspaper_container.pt-BR.yml +8 -0
  161. data/config/locales/newspaper_container.zh.yml +8 -0
  162. data/config/locales/newspaper_issue.de.yml +8 -0
  163. data/config/locales/newspaper_issue.en.yml +8 -0
  164. data/config/locales/newspaper_issue.es.yml +8 -0
  165. data/config/locales/newspaper_issue.fr.yml +8 -0
  166. data/config/locales/newspaper_issue.it.yml +8 -0
  167. data/config/locales/newspaper_issue.pt-BR.yml +8 -0
  168. data/config/locales/newspaper_issue.zh.yml +8 -0
  169. data/config/locales/newspaper_page.de.yml +15 -0
  170. data/config/locales/newspaper_page.en.yml +15 -0
  171. data/config/locales/newspaper_page.es.yml +15 -0
  172. data/config/locales/newspaper_page.fr.yml +15 -0
  173. data/config/locales/newspaper_page.it.yml +15 -0
  174. data/config/locales/newspaper_page.pt-BR.yml +15 -0
  175. data/config/locales/newspaper_page.zh.yml +15 -0
  176. data/config/locales/newspaper_title.de.yml +8 -0
  177. data/config/locales/newspaper_title.en.yml +8 -0
  178. data/config/locales/newspaper_title.es.yml +8 -0
  179. data/config/locales/newspaper_title.fr.yml +8 -0
  180. data/config/locales/newspaper_title.it.yml +8 -0
  181. data/config/locales/newspaper_title.pt-BR.yml +8 -0
  182. data/config/locales/newspaper_title.zh.yml +8 -0
  183. data/config/locales/newspaper_works.de.yml +50 -0
  184. data/config/locales/newspaper_works.en.yml +52 -0
  185. data/config/locales/newspaper_works.es.yml +52 -0
  186. data/config/locales/newspaper_works.fr.yml +52 -0
  187. data/config/locales/newspaper_works.it.yml +52 -0
  188. data/config/locales/newspaper_works.pt-BR.yml +52 -0
  189. data/config/locales/newspaper_works.zh.yml +52 -0
  190. data/config/routes.rb +9 -0
  191. data/config/solr_wrapper_test.yml +9 -0
  192. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  193. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  194. data/config/test-fixture/solr-config/elevate.xml +36 -0
  195. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  196. data/config/test-fixture/solr-config/protwords.txt +21 -0
  197. data/config/test-fixture/solr-config/schema.xml +366 -0
  198. data/config/test-fixture/solr-config/scripts.conf +24 -0
  199. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  200. data/config/test-fixture/solr-config/spellings.txt +2 -0
  201. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  202. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  203. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  204. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  205. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  206. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  207. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  208. data/config/vendor/imagemagick-6-policy.xml +76 -0
  209. data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
  210. data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
  211. data/lib/generators/newspaper_works/assets_generator.rb +29 -0
  212. data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
  213. data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
  214. data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
  215. data/lib/generators/newspaper_works/install_generator.rb +97 -0
  216. data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
  217. data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
  218. data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
  219. data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
  220. data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
  221. data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
  222. data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
  223. data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
  224. data/lib/newspaper_works/configuration.rb +14 -0
  225. data/lib/newspaper_works/data/fileset_helper.rb +25 -0
  226. data/lib/newspaper_works/data/path_helper.rb +40 -0
  227. data/lib/newspaper_works/data/work_derivatives.rb +314 -0
  228. data/lib/newspaper_works/data/work_file.rb +92 -0
  229. data/lib/newspaper_works/data/work_files.rb +181 -0
  230. data/lib/newspaper_works/data.rb +35 -0
  231. data/lib/newspaper_works/engine.rb +42 -0
  232. data/lib/newspaper_works/errors.rb +14 -0
  233. data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
  234. data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
  235. data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
  236. data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
  237. data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
  238. data/lib/newspaper_works/ingest/from_command.rb +52 -0
  239. data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
  240. data/lib/newspaper_works/ingest/issue_images.rb +51 -0
  241. data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
  242. data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
  243. data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
  244. data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
  245. data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
  246. data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
  247. data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
  248. data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
  249. data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
  250. data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
  251. data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
  252. data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
  253. data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
  254. data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
  255. data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
  256. data/lib/newspaper_works/ingest/ndnp.rb +21 -0
  257. data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
  258. data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
  259. data/lib/newspaper_works/ingest/page_image.rb +52 -0
  260. data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
  261. data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
  262. data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
  263. data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
  264. data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
  265. data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
  266. data/lib/newspaper_works/ingest/publication_info.rb +44 -0
  267. data/lib/newspaper_works/ingest.rb +90 -0
  268. data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
  269. data/lib/newspaper_works/logging.rb +54 -0
  270. data/lib/newspaper_works/page_finder.rb +62 -0
  271. data/lib/newspaper_works/resource_fetcher.rb +78 -0
  272. data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
  273. data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
  274. data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
  275. data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
  276. data/lib/newspaper_works/text_extraction.rb +10 -0
  277. data/lib/newspaper_works/version.rb +3 -0
  278. data/lib/newspaper_works.rb +19 -0
  279. data/lib/tasks/newspaper_works_tasks.rake +39 -0
  280. data/newspaper_works.gemspec +49 -0
  281. data/spec/.keep.txt +1 -0
  282. data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
  283. data/spec/controllers/catalog_controller_spec.rb +63 -0
  284. data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
  285. data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
  286. data/spec/factories/ability.rb +6 -0
  287. data/spec/factories/newspaper_issue.rb +7 -0
  288. data/spec/factories/newspaper_issue_ingest.rb +6 -0
  289. data/spec/factories/newspaper_page.rb +7 -0
  290. data/spec/factories/newspaper_page_ingest.rb +6 -0
  291. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  292. data/spec/factories/newspaper_title.rb +8 -0
  293. data/spec/factories/uploaded_pdf_file.rb +9 -0
  294. data/spec/factories/user.rb +13 -0
  295. data/spec/features/front_pages_for_title_spec.rb +19 -0
  296. data/spec/features/newspaper_title_search_spec.rb +30 -0
  297. data/spec/features/newspapers_search_spec.rb +49 -0
  298. data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
  299. data/spec/features_shared.rb +71 -0
  300. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  301. data/spec/fixtures/files/4.1.07.tiff +0 -0
  302. data/spec/fixtures/files/README.md +7 -0
  303. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  304. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  305. data/spec/fixtures/files/credits.md +16 -0
  306. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  307. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  308. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  309. data/spec/fixtures/files/minimal-alto.xml +31 -0
  310. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  311. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  312. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  313. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  314. data/spec/fixtures/files/ocr_alto.xml +202 -0
  315. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  316. data/spec/fixtures/files/ocr_color.tiff +0 -0
  317. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  318. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  319. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  320. data/spec/fixtures/files/page1.tiff +0 -0
  321. data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
  322. data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
  323. data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
  324. data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
  325. data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
  326. data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
  327. data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
  328. data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
  329. data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
  330. data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
  331. data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
  332. data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
  333. data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
  334. data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
  335. data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
  336. data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
  337. data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
  338. data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
  339. data/spec/fixtures/files/resource_mocks/urls.json +82 -0
  340. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  341. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  342. data/spec/fixtures/files/thumbnail.jpg +0 -0
  343. data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
  344. data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
  345. data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
  346. data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
  347. data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
  348. data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
  349. data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
  350. data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
  351. data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
  352. data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
  353. data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
  354. data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
  355. data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
  356. data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
  357. data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
  358. data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
  359. data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
  360. data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
  361. data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
  362. data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
  363. data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
  364. data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
  365. data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
  366. data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
  367. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
  368. data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
  369. data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
  370. data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
  371. data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
  372. data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
  373. data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
  374. data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
  375. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
  376. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
  377. data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
  378. data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
  379. data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
  380. data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
  381. data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
  382. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
  383. data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
  384. data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
  385. data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
  386. data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
  387. data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
  388. data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
  389. data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
  390. data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
  391. data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
  392. data/spec/lib/newspaper_works/logging_spec.rb +53 -0
  393. data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
  394. data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
  395. data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
  396. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
  397. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
  398. data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
  399. data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
  400. data/spec/misc_shared.rb +109 -0
  401. data/spec/model_shared.rb +134 -0
  402. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
  403. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
  404. data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
  405. data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
  406. data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
  407. data/spec/models/newspaper_article_spec.rb +73 -0
  408. data/spec/models/newspaper_container_spec.rb +111 -0
  409. data/spec/models/newspaper_issue_spec.rb +91 -0
  410. data/spec/models/newspaper_page_spec.rb +44 -0
  411. data/spec/models/newspaper_title_spec.rb +116 -0
  412. data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
  413. data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
  414. data/spec/models/solr_document_spec.rb +14 -0
  415. data/spec/ndnp_shared.rb +48 -0
  416. data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
  417. data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
  418. data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
  419. data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
  420. data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
  421. data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
  422. data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
  423. data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
  424. data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
  425. data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
  426. data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
  427. data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
  428. data/spec/routing/route_spec.rb +52 -0
  429. data/spec/search_builders/custom_search_builder_spec.rb +34 -0
  430. data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
  431. data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
  432. data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
  433. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
  434. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
  435. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
  436. data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
  437. data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
  438. data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
  439. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
  440. data/spec/spec_helper.rb +261 -0
  441. data/spec/support/controller_level_helpers.rb +28 -0
  442. data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
  443. data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
  444. data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
  445. data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
  446. data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
  447. data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
  448. data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
  449. data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
  450. data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
  451. data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
  452. data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
  453. data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
  454. data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
  455. data/tasks/newspaperworks_dev.rake +26 -0
  456. data/test/integration/navigation_test.rb +7 -0
  457. data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
  458. data/test/newspaper_works_test.rb +7 -0
  459. data/test/test_helper.rb +17 -0
  460. data/tmp/.keep +0 -0
  461. metadata +1037 -0
@@ -0,0 +1,120 @@
1
+ require 'open3'
2
+
3
require 'shellwords'

module NewspaperWorks
  # Derivative service plugin that writes a JPEG 2000 (.jp2) derivative by
  # shelling out to OpenJPEG (`opj_compress`, or `image_to_j2k` when the 2.x
  # binary is not found on the PATH).
  #
  # Non-TIFF and one-bit sources are first converted with ImageMagick
  # `convert` to a temporary NetPBM file; other TIFF sources are exposed to
  # OpenJPEG through a temporary `.tif` symlink.
  class JP2DerivativeService < NewspaperPageDerivativeService
    # OpenJPEG 2000 Command to make NDNP-compliant grayscale JP2:
    CMD_GRAY = 'opj_compress -i %<source_file>s -o %<out_file>s ' \
               '-d 0,0 -b 64,64 -n 6 -p RLCP -t 1024,1024 -I -M 1 ' \
               '-r 64,53.821,45.249,40,32,26.911,22.630,20,16,14.286,' \
               '11.364,10,8,6.667,5.556,4.762,4,3.333,2.857,2.500,2,' \
               '1.667,1.429,1.190,1'.freeze

    # OpenJPEG 2000 Command to make RGB JP2:
    CMD_COLOR = 'opj_compress -i %<source_file>s -o %<out_file>s ' \
                '-d 0,0 -b 64,64 -n 6 -p RPCL -t 1024,1024 -I -M 1 '\
                '-r 2.4,1.48331273,.91673033,.56657224,.35016049,.21641118,' \
                '.13374944,.0944,.08266171'.freeze

    # OpenJPEG 1.x command replacement for 2.x opj_compress, takes same options;
    # this is necessary on Ubuntu Trusty (e.g. Travis CI)
    CMD_1X = 'image_to_j2k'.freeze

    # Target file extension of this service plugin:
    TARGET_EXT = 'jp2'.freeze

    attr_accessor :source_meta
    attr_reader :file_set
    delegate :uri, :mime_type, to: :file_set

    # @param file_set the file set whose derivative is to be made
    def initialize(file_set)
      # cached result string for imagemagick `identify` command
      @source_meta = nil
      @command = nil
      # temporary files/symlinks to remove once the derivative is written
      @unlink_after_creation = []
      super(file_set)
    end

    # Make the JP2 derivative at @dest_path from the source file.
    #
    # @param filename [String] full path to the source file
    # @return nil when the file set's mime type is already image/jp2,
    #   otherwise the captured stdout of the OpenJPEG command
    def create_derivatives(filename)
      # Base class takes care of loading @source_path, @dest_path
      super(filename)

      # no creation if jp2 master => deemed unnecessary/duplicative
      return if mime_type == 'image/jp2'

      begin
        # if we have a non-TIFF source, or a 1-bit monochrome source, we need
        # to make a NetPBM-based intermediate (temporary) file for OpenJPEG
        # to consume; otherwise a temp symlink works around the OpenJPEG
        # file naming quirk.
        needs_intermediate = !tiff_source? || one_bit?
        needs_intermediate ? make_intermediate_source : make_symlink

        # Run the rendered command to make derivative file at @dest_path
        `#{opj_command}`
      ensure
        # Always remove intermediate files/symlinks, even when a step raised
        cleanup_intermediate
      end
    end

    private

    # source introspection:

    # @return [Boolean] whether `identify` output mentions TIFF
    def tiff_source?
      identify.include?('TIFF')
    end

    # OpenJPEG binaries have annoying quirk of only using TIFF input
    # files whose name ends in .TIF or .tif (three letter); for all
    # non-monochrome TIFF files, we just assume we need to symlink
    # to such a filename.
    def make_symlink
      tmpname = File.join(Dir.tmpdir, "#{SecureRandom.uuid}.tif")
      FileUtils.ln_s(@source_path, tmpname)
      @unlink_after_creation.push(tmpname)
      # finally, point @source_path for command at intermediate link:
      @source_path = tmpname
    end

    # Convert the source into a temporary PPM (color) or PGM (gray) bitmap
    # in the temp dir and point @source_path at it.
    def make_intermediate_source
      ext = use_color? ? 'ppm' : 'pgm'
      tmpname = File.join(Dir.tmpdir, "#{SecureRandom.uuid}.#{ext}")
      # if pdf source, get only first page
      source_path = @source_path
      source_path += '[0]' if @source_path.end_with?('pdf')
      # register before converting, so a failed conversion is still cleaned up
      @unlink_after_creation.push(tmpname)
      # Use ImageMagick `convert` to create intermediate bitmap; paths are
      # shell-escaped so spaces or metacharacters cannot break the command.
      `convert #{Shellwords.escape(source_path)} #{Shellwords.escape(tmpname)}`
      # finally, point @source_path for command at intermediate file:
      @source_path = tmpname
    end

    # Render the OpenJPEG command line for the current source/destination.
    #
    # @return [String] command with shell-escaped file names injected
    def opj_command
      # Get a command template appropriate to OpenJPEG 1.x or 2.x
      use_openjpeg_1x = `which opj_compress`.empty?
      cmd = use_color? ? CMD_COLOR : CMD_GRAY
      cmd = cmd.sub('opj_compress', CMD_1X) if use_openjpeg_1x
      format(
        cmd,
        source_file: Shellwords.escape(@source_path.to_s),
        out_file: Shellwords.escape(@dest_path.to_s)
      )
    end

    # Remove any symlink or intermediate file once no longer needed.
    # Uses rm_f so a file that was never created does not raise.
    def cleanup_intermediate
      @unlink_after_creation.each { |path| FileUtils.rm_f(path) }
      @unlink_after_creation.clear
    end
  end
end
@@ -0,0 +1,91 @@
1
require 'open3'
require 'shellwords'

module NewspaperWorks
  # Base type for derivative services specific to NewspaperPage only.
  #
  # Subclasses set TARGET_EXT and call `super` from create_derivatives so
  # that @source_path and @dest_path are loaded before they run a command.
  class NewspaperPageDerivativeService
    attr_reader :file_set, :master_format
    delegate :uri, :mime_type, to: :file_set

    # Destination extension; overridden by subclasses.
    TARGET_EXT = nil

    # @return [String, nil] TARGET_EXT of the receiving (sub)class
    def self.target_ext
      self::TARGET_EXT
    end

    # @param file_set the file set derivatives are made for
    def initialize(file_set)
      @file_set = file_set
      @dest_path = nil
      @source_path = nil
      # cached output of the `identify` command
      @source_meta = nil
    end

    # @return [Boolean] whether the file set's parent work is a NewspaperPage
    def valid?
      parent = file_set.in_works[0]
      # fallback to Fedora-stored relationships if work's aggregation of
      # file set is not indexed in Solr
      parent = file_set.member_of.select(&:work?)[0] if parent.nil?
      parent.class == NewspaperPage
    end

    def derivative_path_factory
      Hyrax::DerivativePath
    end

    # Prepare full path for passed extension/destination name, creating the
    # containing directory as needed.
    #
    # @param extension [String] destination name / extension
    # @return the derivative path for this file set and extension
    def prepare_path(extension)
      dest_path = derivative_path_factory.derivative_path_for_reference(
        @file_set,
        extension
      )
      # mkdir_p is a no-op when the directory already exists
      FileUtils.mkdir_p(File.dirname(dest_path))
      dest_path
    end

    # calculate and ensure directory components for singular @dest_path
    # should only be used by subclasses producing a single derivative
    def load_destpath
      @dest_path = prepare_path(self.class.target_ext)
    end

    # Run (once) and cache the `identify` command against @source_path.
    #
    # @return [String] stdout of the identify command
    def identify
      return @source_meta unless @source_meta.nil?
      # escape the path so spaces/metacharacters cannot alter the command
      cmd = "identify #{Shellwords.escape(@source_path)}"
      # fallback to graphicsmagick if source is jp2, as Ubuntu 16.10
      # ImageMagick has no jp2 support.
      cmd = "gm #{cmd}" if @source_path.end_with?('jp2')
      # capture3 drains both pipes and waits for the child to exit
      @source_meta, _stderr, _status = Open3.capture3(cmd)
      @source_meta
    end

    # @return [Boolean] false when `identify` output mentions 'Gray' or the
    #   source is one-bit; true otherwise
    def use_color?
      # imagemagick `identify` output describes color space:
      !(identify.include?('Gray') || one_bit?)
    end

    # is source one-bit monochrome?
    def one_bit?
      identify.include?('1-bit')
    end

    # Load @source_path and @dest_path; subclasses do the actual creation.
    #
    # @param filename [String] presumed to be full path to source file
    def create_derivatives(filename)
      @source_path = filename

      # Get destination path from Hyrax for file extension defined in
      # TARGET_EXT constant on respective derivative service subclass.
      load_destpath
    end

    # Remove derivative files whose path ends with the target extension.
    #
    # @param args optional first element overrides the class target_ext
    def cleanup_derivatives(*args)
      target_ext = args[0] || self.class.target_ext
      # nothing to match against (e.g. base class with nil TARGET_EXT)
      return if target_ext.nil?
      derivative_path_factory.derivatives_for_reference(file_set).each do |path|
        FileUtils.rm_f(path) if path.end_with?(target_ext)
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'open3'
2
+
3
require 'shellwords'

module NewspaperWorks
  # Derivative service plugin that writes a PDF derivative using ImageMagick
  # `convert` (prefixed with `gm ` for JP2 sources).
  class PDFDerivativeService < NewspaperPageDerivativeService
    TARGET_EXT = 'pdf'.freeze

    # PDF (JPEG, 8 bit grayscale), 150ppi
    GRAY_PDF_CMD = 'convert %<source_file>s ' \
                   '-resize 1800 -density 150 ' \
                   '-depth 8 -colorspace Gray ' \
                   '-compress jpeg %<out_file>s'.freeze

    # sRGB color PDF (JPEG, 8 bits per channel), 150ppi
    COLOR_PDF_CMD = 'convert %<source_file>s ' \
                    '-resize 1800 -density 150 ' \
                    '-depth 8 ' \
                    '-compress jpeg %<out_file>s'.freeze

    # graphicsmagick prefix, may be needed for jp2 source on Ubuntu
    GM_PREFIX = 'gm '.freeze
    # Misspelled name under which the prefix was originally defined; kept as
    # an alias so any external reference continues to resolve.
    GM_PREFX = GM_PREFIX

    # Get conversion command; command varies on whether or not we have
    # JP2 source, and whether we have color or grayscale material.
    #
    # @return [String] shell command with escaped source/destination paths
    def convert_cmd
      template = use_color? ? COLOR_PDF_CMD : GRAY_PDF_CMD
      cmd = format(
        template,
        source_file: Shellwords.escape(@source_path.to_s),
        out_file: Shellwords.escape(@dest_path.to_s)
      )
      @source_path.end_with?('jp2') ? GM_PREFIX + cmd : cmd
    end

    # @param filename [String] full path to the source file
    def create_derivatives(filename)
      # Base class takes care of loading @source_path, @dest_path
      super(filename)

      # no creation if pdf master
      return if mime_type == 'application/pdf'

      # Get and run imagemagick or graphicsmagick command
      `#{convert_cmd}`
    end
  end
end
@@ -0,0 +1,114 @@
1
+ # General derivative service for NewspaperWorks, which is meant to wrap
2
+ # and replace the stock Hyrax::FileSetDerivativeService with a proxy
3
+ # that runs one or more derivative service "plugin" components.
4
+ #
5
+ # Note: Hyrax::DerivativeService consumes this, instead of (directly)
6
+ # consuming Hyrax::FileSetDerivativeService.
7
+ #
8
+ # Unlike the "run the first valid plugin" arrangement that the
9
+ # Hyrax::DerivativeService uses to run an actual derivative creation
10
+ # service component, this component is:
11
+ #
12
+ # (a) Consumed by Hyrax::DerivativeService as that first valid plugin;
13
+ #
14
+ # (b) Wraps and runs 0..* plugins, not just the first.
15
+ #
16
+ # This should be registered to take precedence over default by:
17
+ # Hyrax::DerivativeService.services.unshift(
18
+ # NewspaperWorks::PluggableDerivativeService
19
+ # )
20
+ #
21
+ # Modify NewspaperWorks::PluggableDerivativeService.plugins
22
+ # to add, remove, or reorder plugin (derivative service) classes.
23
+ #
24
# Composite derivative service: instantiates every registered plugin class
# for the file set, keeps the valid ones, and forwards the allowed methods
# (see .allowed_methods) to each of them in order.
class NewspaperWorks::PluggableDerivativeService
  attr_reader :file_set
  delegate :uri, :mime_type, to: :file_set

  # default plugin Hyrax OOTB, makes thumbnails and sometimes extracts text;
  # exposed (with the forwardable method names) as class-level accessors
  @plugins = [Hyrax::FileSetDerivativesService]
  @allowed_methods = [:cleanup_derivatives, :create_derivatives]
  class << self
    attr_accessor :plugins, :allowed_methods
  end

  # @return [Array<Class>] plugin classes registered on the class
  def plugins
    self.class.plugins
  end

  # @param file_set the file set passed to each plugin's constructor
  def initialize(file_set)
    @file_set = file_set
  end

  def valid?
    # this wrapper/proxy/composite is always valid, but it may compose
    # multiple plugins, some of which may or may not be valid, so
    # validity checks happen within as well.
    true
  end

  # Ruby invokes this hook with (name, include_private); accepting only one
  # argument made every `respond_to?` call for an undefined method raise
  # ArgumentError.
  def respond_to_missing?(method_name, include_private = false)
    self.class.allowed_methods.include?(method_name) || super
  end

  # get derivative services relevant to method name and file_set context
  # -- omits plugins if particular destination exists or will soon.
  #
  # @param method_name [Symbol] method about to be forwarded
  # @return [Array] valid plugin instances that should run
  def services(method_name)
    candidates = plugins.map { |plugin| plugin.new(file_set) }.select(&:valid?)
    candidates.reject do |service|
      dest = service.class.target_ext if service.class.respond_to?(:target_ext)
      skip_destination?(method_name, dest)
    end
  end

  # Forward an allowed method, with its arguments and block, to all valid
  # services for the file_set, in order; anything else is a NoMethodError.
  def method_missing(name, *args, &block)
    return super unless self.class.allowed_methods.include?(name)
    services(name).each do |service|
      service.send(name, *args, &block)
    end
  end

  private

  # @return [Boolean] true only for :create_derivatives when the
  #   destination already exists or an attachment for it is logged
  def skip_destination?(method_name, destination_name)
    return false if file_set.id.nil? || destination_name.nil?
    return false unless method_name == :create_derivatives
    # skip :create_derivatives if existing --> do not re-create
    existing_derivative?(destination_name) ||
      impending_derivative?(destination_name)
  end

  def existing_derivative?(name)
    path = derivative_path_factory.derivative_path_for_reference(
      file_set,
      name
    )
    File.exist?(path)
  end

  # is there an impending attachment from ingest logged to db?
  # -- avoids stomping over pre-made derivative
  #    for which an attachment is still in-progress.
  def impending_derivative?(name)
    result = NewspaperWorks::DerivativeAttachment.find_by(
      fileset_id: file_set.id,
      destination_name: name
    )
    !result.nil?
  end

  def derivative_path_factory
    Hyrax::DerivativePath
  end
end
@@ -0,0 +1,56 @@
1
module NewspaperWorks
  # Writes ALTO XML, plain-text and JSON derivatives for a page. When
  # TextFormatsFromALTOService reports an ALTO path, that service does the
  # work; otherwise the source file is run through PageOCR.
  class TextExtractionDerivativeService < NewspaperPageDerivativeService
    def initialize(file_set)
      super(file_set)
      @alto_path = nil
      @txt_path = nil
    end

    # @param src path to the source file
    def create_derivatives(src)
      from_alto = NewspaperWorks::TextFormatsFromALTOService.new(file_set)
      if from_alto.alto_path.nil?
        create_derivatives_from_ocr(src)
      else
        from_alto.create_derivatives(src)
      end
    end

    # Run OCR over the source file and write all three text derivatives.
    #
    # @param filename path to the source file
    def create_derivatives_from_ocr(filename)
      @source_path = filename
      # destinations (directories are prepared) for ALTO .xml, plain .txt
      # and flat .json output, in that order:
      @alto_path = prepare_path('xml')
      @txt_path = prepare_path('txt')
      @json_path = prepare_path('json')
      page_ocr = NewspaperWorks::TextExtraction::PageOCR.new(filename)
      # OCR will run once, on first method call to either .alto or .plain:
      write_plain_text(page_ocr.plain)
      write_alto(page_ocr.alto)
      write_json(page_ocr.word_json)
    end

    # @param xml ALTO content written to @alto_path
    def write_alto(xml)
      File.open(@alto_path, 'w') { |out| out.write(xml) }
    end

    # @param text plain text written to @txt_path
    def write_plain_text(text)
      File.open(@txt_path, 'w') { |out| out.write(text) }
    end

    # @param text JSON content written to @json_path
    def write_json(text)
      File.open(@json_path, 'w') { |out| out.write(text) }
    end

    # Remove the txt, xml and json derivatives via the base implementation.
    def cleanup_derivatives
      super('txt')
      super('xml')
      super('json')
    end
  end
end
@@ -0,0 +1,77 @@
1
module NewspaperWorks
  # Plugin to make text format derivatives (JSON, plain-text) from ALTO,
  # either existing derivative, or an impending attachment.
  # NOTE: to keep this from conflicting with TextExtractionDerivativeService,
  #   this class should be invoked by it, not PluggableDerivativeService.
  class TextFormatsFromALTOService < NewspaperPageDerivativeService
    # NOTE(review): this class writes 'json' and 'txt' and nothing in it
    # reads this value; 'tiff' may be a copy/paste leftover -- confirm.
    TARGET_EXT = 'tiff'.freeze

    # Write data, UTF-8 encoded, to the derivative path for destination.
    #
    # @param destination [String] destination name, e.g. 'json' or 'txt'
    # @param data content to write
    def save_derivative(destination, data)
      # Load/prepare base of "pairtree" dir structure for extension, fileset
      prepare_path(destination)
      target = derivative_path_factory.derivative_path_for_reference(
        @file_set,
        destination
      )
      File.open(target, "w:UTF-8") { |out| out.write(data) }
    end

    # @return [Boolean] true when path is non-nil, exists, and has size > 0
    def nonempty_file?(path)
      return false if path.nil?
      File.exist?(path) && !File.size(path).zero?
    end

    # if there was no derivative yet, there might be one in-transit from
    # an ingest, so check for that, and use its source if applicable:
    #
    # @return path of a logged 'xml' attachment when it is a non-empty
    #   file, otherwise nil
    def incoming_alto_path
      candidate = NewspaperWorks::DerivativeAttachment.where(
        fileset_id: @file_set.id,
        destination_name: 'xml'
      ).pluck(:path).uniq.first
      nonempty_file?(candidate) ? candidate : nil
    end

    # @return path to ALTO XML: the existing non-empty derivative if there
    #   is one, else the incoming attachment path (possibly nil)
    def alto_path
      existing = derivative_path_factory.derivative_path_for_reference(
        @file_set,
        'xml'
      )
      nonempty_file?(existing) ? existing : incoming_alto_path
    end

    # @return [String, nil] ALTO content read as UTF-8, nil when no path
    def alto
      path = alto_path
      return if path.nil?
      File.read(path, encoding: 'UTF-8')
    end

    # Make JSON and plain-text derivatives from the ALTO content.
    # As this plugin makes derivatives of a derivative, _filename is ignored.
    def create_derivatives(_filename)
      xml = alto
      return if xml.nil?
      # Image dimensions from the primary file help ensure proper scaling;
      # both are nil when there is no original file.
      primary = @file_set.original_file
      width = primary.nil? ? nil : primary.width[0].to_i
      height = primary.nil? ? nil : primary.height[0].to_i
      # AltoReader is responsible for transcoding, this class just saves result
      reader = NewspaperWorks::TextExtraction::AltoReader.new(xml, width, height)
      save_derivative('json', reader.json)
      save_derivative('txt', reader.text)
    end

    def cleanup_derivatives(*args)
      # do nothing here; NewspaperWorks::TextExtractionDerivativeService
      #   has this job instead for cleaning ALTO, JSON, TXT.
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'open3'
2
+
3
require 'shellwords'

module NewspaperWorks
  # Derivative service plugin that writes a TIFF derivative using ImageMagick
  # `convert` (prefixed with `gm ` for JP2 sources).
  class TIFFDerivativeService < NewspaperPageDerivativeService
    TARGET_EXT = 'tiff'.freeze

    # For imagemagick commands, the output type is determined by the
    #   output file's extension.
    # TIFF (LZW, 8 bit grayscale)
    GRAY_CMD = 'convert %<source_file>s ' \
               '-depth 8 -colorspace Gray ' \
               '-compress lzw %<out_file>s'.freeze

    # Monochrome one-bit black/white TIFF, Group 4 compressed:
    MONO_CMD = 'convert %<source_file>s ' \
               '-depth 1 -monochrome -compress Group4 -type bilevel ' \
               '%<out_file>s'.freeze

    # sRGB color TIFF (8 bits per channel, lzw)
    # NOTE(review): '-depth 24' does not obviously match "8 bits per
    # channel" -- confirm the intended ImageMagick depth.
    COLOR_CMD = 'convert %<source_file>s ' \
                '-depth 24 ' \
                '-compress lzw %<out_file>s'.freeze

    # graphicsmagick prefix, may be needed for jp2 source on Ubuntu
    GM_PREFIX = 'gm '.freeze
    # Misspelled name under which the prefix was originally defined; kept as
    # an alias so any external reference continues to resolve.
    GM_PREFX = GM_PREFIX

    # Get conversion command; command varies on whether or not we have
    # JP2 source, and whether we have one-bit, color or grayscale material.
    #
    # @return [String] shell command with escaped source/destination paths
    def convert_cmd
      source_path = @source_path
      # for a pdf source, convert only the first page
      source_path += '[0]' if @source_path.end_with?('pdf')
      cmd = format(
        command_template,
        source_file: Shellwords.escape(source_path),
        out_file: Shellwords.escape(@dest_path.to_s)
      )
      # normalization of command based on source
      @source_path.end_with?('jp2') ? GM_PREFIX + cmd : cmd
    end

    # @param filename [String] full path to the source file
    def create_derivatives(filename)
      # Base class takes care of loading @source_path, @dest_path
      super(filename)

      # no creation if tiff master
      return if mime_type == 'image/tiff'

      # Get and run imagemagick or graphicsmagick command
      `#{convert_cmd}`
    end

    private

    # Pick the command template: one-bit wins over color/gray detection.
    def command_template
      if one_bit?
        MONO_CMD
      elsif use_color?
        COLOR_CMD
      else
        GRAY_CMD
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
module NewspaperWorks
  # validates start and end date are properly formatted and end date comes after
  # or on the same date as the start date.
  class PublicationDateStartEndValidator < ActiveModel::Validator
    # yyyy, yyyy-mm or yyyy-mm-dd. The day part is only allowed after a
    # month part, and every day 01..31 is accepted (calendar validity of a
    # full date is checked separately with Date.valid_date?).
    DATE_RANGE_REGEX = /\A\d{4}(-(0[1-9]|1[0-2])(-(0[1-9]|[12][0-9]|3[01]))?)?\z/

    # Adds errors to the record when either date is malformed, or when the
    # end date precedes the start date.
    #
    # @param record object responding to publication_date_start,
    #   publication_date_end and errors
    def validate(record)
      start_date = record.publication_date_start
      end_date = record.publication_date_end
      valid_dates?(start_date, end_date, record) &&
        start_before_end?(start_date, end_date, record)
    end

    private

    # @param pub_date [String] candidate date string
    # @return [Boolean] whether the string matches DATE_RANGE_REGEX and, if
    #   it has year, month and day, names a real calendar date
    def publication_date_valid?(pub_date)
      return false unless DATE_RANGE_REGEX.match(pub_date)
      parts = pub_date.split("-").map(&:to_i)
      parts.length < 3 || Date.valid_date?(*parts)
    end

    # Compare the dates component-wise (year, then month, then day), down
    # to the precision both dates share. Components are zero-padded strings
    # of equal width, so comparing the arrays orders them chronologically.
    #
    # @return [Boolean] whether publication_date_start has no errors
    def start_before_end?(start_date, end_date, record)
      return true unless start_date && end_date
      date_error = "Publication start date must be earlier or the same as end date."
      pub_start = start_date.split("-")
      pub_end = end_date.split("-")
      precision = [pub_start.length, pub_end.length].min
      # a later year must win even when its month/day are numerically smaller
      if (pub_end.first(precision) <=> pub_start.first(precision)) == -1
        record.errors[:publication_date_start] << date_error
      end
      record.errors[:publication_date_start].blank?
    end

    # Adds a format error for each given date that is not valid.
    #
    # @return [Boolean] whether neither date attribute has errors
    def valid_dates?(start_date, end_date, record)
      date_error = "Incorrect Date. Date input should be formatted yyyy[-mm][-dd] and be a valid date."
      if start_date && !publication_date_valid?(start_date)
        record.errors[:publication_date_start] << date_error
      end
      if end_date && !publication_date_valid?(end_date)
        record.errors[:publication_date_end] << date_error
      end
      record.errors[:publication_date_start].blank? && record.errors[:publication_date_end].blank?
    end
  end
end
@@ -0,0 +1,16 @@
1
module NewspaperWorks
  # validates that a properly formatted date has been entered
  class PublicationDateValidator < ActiveModel::Validator
    DATE_REGEX = /\A\d{4}-((0[1-9])|(1[0-2]))-((0[1-9])|([1-2][0-9])|(3[0-1]))\z/

    # Adds an error on :publication_date when a present value either does
    # not match DATE_REGEX or is not a real calendar date. A blank value is
    # left alone.
    #
    # @param record object responding to publication_date and errors
    def validate(record)
      return if record.publication_date.blank?
      return if well_formed?(record.publication_date)
      record.errors[:publication_date] <<
        "Incorrect Date. Date input should be formatted yyyy-mm-dd and be a valid date."
    end

    private

    # @param value [String] candidate date string
    # @return [Boolean] whether value matches yyyy-mm-dd and is a valid date
    def well_formed?(value)
      return false unless DATE_REGEX.match(value)
      year, month, day = value.split("-").map(&:to_i)
      Date.valid_date?(year, month, day)
    end
  end
end
@@ -0,0 +1,9 @@
1
<%# Gallery-view wrapper for one search result document: thumbnail followed
    by the gallery caption partials. -%>
<div class="document col-xs-6 col-md-3">
  <div class="thumbnail"
       data-fileset="<%= document.file_set_ids&.first %>"
       data-query="<%= highlight_matches(document, 'all_text_tsimv', 'em') || search_query(current_search_session.query_params) %>">
    <%= render_newspaper_thumbnail_tag(document, current_search_session.query_params) %>
    <div class="caption">
      <%= render_document_partials document, blacklight_config.view_config(:gallery).partials, document_counter: document_counter %>
    </div>
  </div>
</div>
@@ -0,0 +1,9 @@
1
<%# Gallery-view wrapper for one search result document: thumbnail followed
    by the gallery caption partials. -%>
<div class="document col-xs-6 col-md-3">
  <div class="thumbnail"
       data-fileset="<%= document.file_set_ids&.first %>"
       data-query="<%= highlight_matches(document, 'all_text_tsimv', 'em') || search_query(current_search_session.query_params) %>">
    <%= render_newspaper_thumbnail_tag(document, current_search_session.query_params) %>
    <div class="caption">
      <%= render_document_partials document, blacklight_config.view_config(:gallery).partials, document_counter: document_counter %>
    </div>
  </div>
</div>
@@ -0,0 +1,23 @@
1
<%# based on blacklight/app/views/catalog/_index_header_default.html.erb %>
<%# header bar for doc items in index view -%>
<div class="documentHeader row">
  <%# Render the bookmark/action controls first: how many bootstrap columns
      the title gets depends on whether any controls were produced. -%>
  <% document_actions = capture do %>
    <%= render_index_doc_actions document, wrapping_class: "index-document-functions col-sm-3 col-lg-2" %>
  <% end %>
  <% title_columns = document_actions.present? ? "col-sm-9 col-lg-10" : "col-md-12" %>
  <h3 class="index_title document-title-heading <%= title_columns %>">
    <% counter = document_counter_with_offset(document_counter) %>
    <% if counter %>
      <span class="document-counter">
        <%= t('blacklight.search.documents.counter', counter: counter) %>
      </span>
    <% end %>
    <%# link to the article, anchored to the current search query %>
    <%= link_to document.title_or_label,
                hyrax_newspaper_article_path(
                  document.id,
                  anchor: iiif_search_anchor(current_search_session.query_params)
                ) %>
  </h3>
  <%= document_actions %>
</div>