newspaper_works 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (461) hide show
  1. checksums.yaml +7 -0
  2. data/.fcrepo_wrapper +4 -0
  3. data/.gitignore +43 -0
  4. data/.rubocop.yml +143 -0
  5. data/.solr_wrapper +8 -0
  6. data/.travis.yml +50 -0
  7. data/Gemfile +47 -0
  8. data/LICENSE +203 -0
  9. data/README.md +159 -0
  10. data/Rakefile +38 -0
  11. data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
  12. data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
  13. data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
  14. data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
  15. data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
  16. data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
  17. data/app/assets/config/newspaper_works_manifest.js +2 -0
  18. data/app/assets/images/newspaper_works/.keep +0 -0
  19. data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
  20. data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
  21. data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
  22. data/app/assets/javascripts/newspaper_works.js +4 -0
  23. data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
  24. data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
  25. data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
  26. data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
  27. data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
  28. data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
  29. data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
  30. data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
  31. data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
  32. data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
  33. data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
  34. data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
  35. data/app/forms/hyrax/newspaper_article_form.rb +11 -0
  36. data/app/forms/hyrax/newspaper_container_form.rb +11 -0
  37. data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
  38. data/app/forms/hyrax/newspaper_page_form.rb +15 -0
  39. data/app/forms/hyrax/newspaper_title_form.rb +12 -0
  40. data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
  41. data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
  42. data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
  43. data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
  44. data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
  45. data/app/helpers/newspaper_works/application_helper.rb +5 -0
  46. data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
  47. data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
  48. data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
  49. data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
  50. data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
  51. data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
  52. data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
  53. data/app/indexers/newspaper_article_indexer.rb +16 -0
  54. data/app/indexers/newspaper_container_indexer.rb +18 -0
  55. data/app/indexers/newspaper_issue_indexer.rb +26 -0
  56. data/app/indexers/newspaper_page_indexer.rb +9 -0
  57. data/app/indexers/newspaper_title_indexer.rb +19 -0
  58. data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
  59. data/app/jobs/newspaper_works/application_job.rb +4 -0
  60. data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
  61. data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
  62. data/app/mailers/newspaper_works/application_mailer.rb +8 -0
  63. data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
  64. data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
  65. data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
  66. data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
  67. data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
  68. data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
  69. data/app/models/file_set.rb +10 -0
  70. data/app/models/newspaper_article.rb +158 -0
  71. data/app/models/newspaper_container.rb +86 -0
  72. data/app/models/newspaper_issue.rb +115 -0
  73. data/app/models/newspaper_page.rb +70 -0
  74. data/app/models/newspaper_title.rb +111 -0
  75. data/app/models/newspaper_works/application_record.rb +6 -0
  76. data/app/models/newspaper_works/derivative_attachment.rb +8 -0
  77. data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
  78. data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
  79. data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
  80. data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
  81. data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
  82. data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
  83. data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
  84. data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
  85. data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
  86. data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
  87. data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
  88. data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
  89. data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
  90. data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
  91. data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
  92. data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
  93. data/app/services/hyrax/article_genre_service.rb +9 -0
  94. data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
  95. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
  96. data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
  97. data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
  98. data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
  99. data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
  100. data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
  101. data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
  102. data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
  103. data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
  104. data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
  105. data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
  106. data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
  107. data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
  108. data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
  109. data/app/views/catalog/_snippets_more.html.erb +16 -0
  110. data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
  111. data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
  112. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  113. data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
  114. data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
  115. data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
  116. data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
  117. data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
  118. data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
  119. data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
  120. data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
  121. data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
  122. data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
  123. data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
  124. data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
  125. data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
  126. data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
  127. data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
  128. data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
  129. data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
  130. data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
  131. data/app/views/newspaper_works/base/_show.html.erb +45 -0
  132. data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
  133. data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
  134. data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
  135. data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
  136. data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
  137. data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
  138. data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
  139. data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
  140. data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
  141. data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
  142. data/app/views/records/edit_fields/_genre.html.erb +4 -0
  143. data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
  144. data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
  145. data/bin/rails +13 -0
  146. data/config/fcrepo_wrapper_test.yml +5 -0
  147. data/config/initializers/assets.rb +2 -0
  148. data/config/locales/newspaper_article.de.yml +12 -0
  149. data/config/locales/newspaper_article.en.yml +12 -0
  150. data/config/locales/newspaper_article.es.yml +12 -0
  151. data/config/locales/newspaper_article.fr.yml +12 -0
  152. data/config/locales/newspaper_article.it.yml +12 -0
  153. data/config/locales/newspaper_article.pt-BR.yml +12 -0
  154. data/config/locales/newspaper_article.zh.yml +12 -0
  155. data/config/locales/newspaper_container.de.yml +8 -0
  156. data/config/locales/newspaper_container.en.yml +8 -0
  157. data/config/locales/newspaper_container.es.yml +8 -0
  158. data/config/locales/newspaper_container.fr.yml +8 -0
  159. data/config/locales/newspaper_container.it.yml +8 -0
  160. data/config/locales/newspaper_container.pt-BR.yml +8 -0
  161. data/config/locales/newspaper_container.zh.yml +8 -0
  162. data/config/locales/newspaper_issue.de.yml +8 -0
  163. data/config/locales/newspaper_issue.en.yml +8 -0
  164. data/config/locales/newspaper_issue.es.yml +8 -0
  165. data/config/locales/newspaper_issue.fr.yml +8 -0
  166. data/config/locales/newspaper_issue.it.yml +8 -0
  167. data/config/locales/newspaper_issue.pt-BR.yml +8 -0
  168. data/config/locales/newspaper_issue.zh.yml +8 -0
  169. data/config/locales/newspaper_page.de.yml +15 -0
  170. data/config/locales/newspaper_page.en.yml +15 -0
  171. data/config/locales/newspaper_page.es.yml +15 -0
  172. data/config/locales/newspaper_page.fr.yml +15 -0
  173. data/config/locales/newspaper_page.it.yml +15 -0
  174. data/config/locales/newspaper_page.pt-BR.yml +15 -0
  175. data/config/locales/newspaper_page.zh.yml +15 -0
  176. data/config/locales/newspaper_title.de.yml +8 -0
  177. data/config/locales/newspaper_title.en.yml +8 -0
  178. data/config/locales/newspaper_title.es.yml +8 -0
  179. data/config/locales/newspaper_title.fr.yml +8 -0
  180. data/config/locales/newspaper_title.it.yml +8 -0
  181. data/config/locales/newspaper_title.pt-BR.yml +8 -0
  182. data/config/locales/newspaper_title.zh.yml +8 -0
  183. data/config/locales/newspaper_works.de.yml +50 -0
  184. data/config/locales/newspaper_works.en.yml +52 -0
  185. data/config/locales/newspaper_works.es.yml +52 -0
  186. data/config/locales/newspaper_works.fr.yml +52 -0
  187. data/config/locales/newspaper_works.it.yml +52 -0
  188. data/config/locales/newspaper_works.pt-BR.yml +52 -0
  189. data/config/locales/newspaper_works.zh.yml +52 -0
  190. data/config/routes.rb +9 -0
  191. data/config/solr_wrapper_test.yml +9 -0
  192. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  193. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  194. data/config/test-fixture/solr-config/elevate.xml +36 -0
  195. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  196. data/config/test-fixture/solr-config/protwords.txt +21 -0
  197. data/config/test-fixture/solr-config/schema.xml +366 -0
  198. data/config/test-fixture/solr-config/scripts.conf +24 -0
  199. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  200. data/config/test-fixture/solr-config/spellings.txt +2 -0
  201. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  202. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  203. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  204. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  205. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  206. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  207. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  208. data/config/vendor/imagemagick-6-policy.xml +76 -0
  209. data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
  210. data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
  211. data/lib/generators/newspaper_works/assets_generator.rb +29 -0
  212. data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
  213. data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
  214. data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
  215. data/lib/generators/newspaper_works/install_generator.rb +97 -0
  216. data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
  217. data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
  218. data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
  219. data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
  220. data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
  221. data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
  222. data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
  223. data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
  224. data/lib/newspaper_works/configuration.rb +14 -0
  225. data/lib/newspaper_works/data/fileset_helper.rb +25 -0
  226. data/lib/newspaper_works/data/path_helper.rb +40 -0
  227. data/lib/newspaper_works/data/work_derivatives.rb +314 -0
  228. data/lib/newspaper_works/data/work_file.rb +92 -0
  229. data/lib/newspaper_works/data/work_files.rb +181 -0
  230. data/lib/newspaper_works/data.rb +35 -0
  231. data/lib/newspaper_works/engine.rb +42 -0
  232. data/lib/newspaper_works/errors.rb +14 -0
  233. data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
  234. data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
  235. data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
  236. data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
  237. data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
  238. data/lib/newspaper_works/ingest/from_command.rb +52 -0
  239. data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
  240. data/lib/newspaper_works/ingest/issue_images.rb +51 -0
  241. data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
  242. data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
  243. data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
  244. data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
  245. data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
  246. data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
  247. data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
  248. data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
  249. data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
  250. data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
  251. data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
  252. data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
  253. data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
  254. data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
  255. data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
  256. data/lib/newspaper_works/ingest/ndnp.rb +21 -0
  257. data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
  258. data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
  259. data/lib/newspaper_works/ingest/page_image.rb +52 -0
  260. data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
  261. data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
  262. data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
  263. data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
  264. data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
  265. data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
  266. data/lib/newspaper_works/ingest/publication_info.rb +44 -0
  267. data/lib/newspaper_works/ingest.rb +90 -0
  268. data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
  269. data/lib/newspaper_works/logging.rb +54 -0
  270. data/lib/newspaper_works/page_finder.rb +62 -0
  271. data/lib/newspaper_works/resource_fetcher.rb +78 -0
  272. data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
  273. data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
  274. data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
  275. data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
  276. data/lib/newspaper_works/text_extraction.rb +10 -0
  277. data/lib/newspaper_works/version.rb +3 -0
  278. data/lib/newspaper_works.rb +19 -0
  279. data/lib/tasks/newspaper_works_tasks.rake +39 -0
  280. data/newspaper_works.gemspec +49 -0
  281. data/spec/.keep.txt +1 -0
  282. data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
  283. data/spec/controllers/catalog_controller_spec.rb +63 -0
  284. data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
  285. data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
  286. data/spec/factories/ability.rb +6 -0
  287. data/spec/factories/newspaper_issue.rb +7 -0
  288. data/spec/factories/newspaper_issue_ingest.rb +6 -0
  289. data/spec/factories/newspaper_page.rb +7 -0
  290. data/spec/factories/newspaper_page_ingest.rb +6 -0
  291. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  292. data/spec/factories/newspaper_title.rb +8 -0
  293. data/spec/factories/uploaded_pdf_file.rb +9 -0
  294. data/spec/factories/user.rb +13 -0
  295. data/spec/features/front_pages_for_title_spec.rb +19 -0
  296. data/spec/features/newspaper_title_search_spec.rb +30 -0
  297. data/spec/features/newspapers_search_spec.rb +49 -0
  298. data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
  299. data/spec/features_shared.rb +71 -0
  300. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  301. data/spec/fixtures/files/4.1.07.tiff +0 -0
  302. data/spec/fixtures/files/README.md +7 -0
  303. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  304. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  305. data/spec/fixtures/files/credits.md +16 -0
  306. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  307. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  308. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  309. data/spec/fixtures/files/minimal-alto.xml +31 -0
  310. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  311. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  312. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  313. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  314. data/spec/fixtures/files/ocr_alto.xml +202 -0
  315. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  316. data/spec/fixtures/files/ocr_color.tiff +0 -0
  317. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  318. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  319. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  320. data/spec/fixtures/files/page1.tiff +0 -0
  321. data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
  322. data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
  323. data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
  324. data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
  325. data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
  326. data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
  327. data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
  328. data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
  329. data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
  330. data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
  331. data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
  332. data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
  333. data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
  334. data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
  335. data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
  336. data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
  337. data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
  338. data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
  339. data/spec/fixtures/files/resource_mocks/urls.json +82 -0
  340. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  341. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  342. data/spec/fixtures/files/thumbnail.jpg +0 -0
  343. data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
  344. data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
  345. data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
  346. data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
  347. data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
  348. data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
  349. data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
  350. data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
  351. data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
  352. data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
  353. data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
  354. data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
  355. data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
  356. data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
  357. data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
  358. data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
  359. data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
  360. data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
  361. data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
  362. data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
  363. data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
  364. data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
  365. data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
  366. data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
  367. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
  368. data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
  369. data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
  370. data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
  371. data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
  372. data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
  373. data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
  374. data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
  375. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
  376. data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
  377. data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
  378. data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
  379. data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
  380. data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
  381. data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
  382. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
  383. data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
  384. data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
  385. data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
  386. data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
  387. data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
  388. data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
  389. data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
  390. data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
  391. data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
  392. data/spec/lib/newspaper_works/logging_spec.rb +53 -0
  393. data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
  394. data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
  395. data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
  396. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
  397. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
  398. data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
  399. data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
  400. data/spec/misc_shared.rb +109 -0
  401. data/spec/model_shared.rb +134 -0
  402. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
  403. data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
  404. data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
  405. data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
  406. data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
  407. data/spec/models/newspaper_article_spec.rb +73 -0
  408. data/spec/models/newspaper_container_spec.rb +111 -0
  409. data/spec/models/newspaper_issue_spec.rb +91 -0
  410. data/spec/models/newspaper_page_spec.rb +44 -0
  411. data/spec/models/newspaper_title_spec.rb +116 -0
  412. data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
  413. data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
  414. data/spec/models/solr_document_spec.rb +14 -0
  415. data/spec/ndnp_shared.rb +48 -0
  416. data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
  417. data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
  418. data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
  419. data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
  420. data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
  421. data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
  422. data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
  423. data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
  424. data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
  425. data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
  426. data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
  427. data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
  428. data/spec/routing/route_spec.rb +52 -0
  429. data/spec/search_builders/custom_search_builder_spec.rb +34 -0
  430. data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
  431. data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
  432. data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
  433. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
  434. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
  435. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
  436. data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
  437. data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
  438. data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
  439. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
  440. data/spec/spec_helper.rb +261 -0
  441. data/spec/support/controller_level_helpers.rb +28 -0
  442. data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
  443. data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
  444. data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
  445. data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
  446. data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
  447. data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
  448. data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
  449. data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
  450. data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
  451. data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
  452. data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
  453. data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
  454. data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
  455. data/tasks/newspaperworks_dev.rake +26 -0
  456. data/test/integration/navigation_test.rb +7 -0
  457. data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
  458. data/test/newspaper_works_test.rb +7 -0
  459. data/test/test_helper.rb +17 -0
  460. data/tmp/.keep +0 -0
  461. metadata +1037 -0
@@ -0,0 +1,204 @@
require 'fileutils'
require 'spec_helper'

# Specs for NewspaperWorks::PluggableDerivativeService covering:
#   * getting/setting the list of derivative service plugins,
#   * delegation of create/cleanup calls to every configured plugin,
#   * registration with Hyrax::DerivativeService,
#   * integration runs of the default plugins against a TIFF fixture,
#   * skipping of plugins whose derivative was logged as pre-made.
RSpec.describe NewspaperWorks::PluggableDerivativeService do
  let(:valid_file_set) { FileSet.new }

  # A saved FileSet that is a member of a saved NewspaperPage. The mime type
  # is set directly on the instance and validation is skipped on save.
  let(:persisted_file_set) do
    fs = FileSet.new
    work = NewspaperPage.new
    work.title = ['This is a page!']
    work.members.push(fs)
    fs.instance_variable_set(:@mime_type, 'image/tiff')
    fs.save!(validate: false)
    work.save!(validate: false)
    fs
  end

  let(:fixture_path) do
    File.join(NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files')
  end

  # Cache the originally configured derivative service plugins before each
  # example and restore them afterwards, since examples reassign the list.
  before do
    @orig_plugins = described_class.plugins
  end

  after do
    described_class.plugins = @orig_plugins
  end

  describe ".plugins=" do
    it "allows setting of derivative service plugins" do
      expect(described_class.plugins).to eq @orig_plugins
      described_class.plugins = [Hyrax::FileSetDerivativesService] * 2
      expect(described_class.plugins).to eq [Hyrax::FileSetDerivativesService] * 2
    end
  end

  describe "calls all derivative plugins" do
    # Minimal stand-in plugin that records, at class level, how many times
    # create_derivatives and cleanup_derivatives were invoked.
    # NOTE(review): declared with `class` inside a block, so the constant is
    # not scoped to this example group and its counters persist across
    # examples; the examples below therefore compare against a baseline.
    class FakeDerivativeService
      @create_called = 0
      @cleanup_called = 0
      class << self
        attr_accessor :create_called, :cleanup_called

        # Extension of the derivative this fake claims to produce.
        def target_ext
          'txt'
        end
      end

      def initialize(fileset)
        @fileset = fileset
        @created = false
      end

      def valid?
        true
      end

      # Counts the call and returns the given filename unchanged.
      def create_derivatives(filename)
        self.class.create_called += 1
        filename
      end

      # Counts the call.
      def cleanup_derivatives
        self.class.cleanup_called += 1
      end
    end

    # Creates an empty file (and its parent directories) at the path Hyrax
    # computes for the given file set and destination name.
    def touch_fake_derivative_file(file_set, ext)
      path = Hyrax::DerivativePath.derivative_path_for_reference(file_set, ext)
      FileUtils.mkdir_p(File.dirname(path))
      FileUtils.touch(path)
    end

    it "calls each plugin on create" do
      create_calls = FakeDerivativeService.create_called
      described_class.plugins = [FakeDerivativeService, FakeDerivativeService]
      service = described_class.new(FileSet.new)
      service.create_derivatives('not_a_real_filename')
      expect(FakeDerivativeService.create_called).to eq create_calls + 2
    end

    it "does not re-create existing derivative" do
      create_calls = FakeDerivativeService.create_called
      described_class.plugins = [FakeDerivativeService]
      service = described_class.new(persisted_file_set)
      expect(persisted_file_set.id).not_to be_nil
      # The fake declares a 'txt' target extension; create a file at the
      # place Hyrax would use for a derivative with that destination name.
      touch_fake_derivative_file(persisted_file_set, 'txt')
      service.create_derivatives('/nonsense/source/path/ignored')
      # create calls logged by fake should not increment,
      # as PluggableDerivativeService should have skipped calling
      # plugin's create_derivatives method w/ presence of existing derivative
      expect(FakeDerivativeService.create_called).to eq create_calls
    end

    it "calls each plugin on cleanup" do
      # Compare against a baseline rather than an absolute count, because the
      # counter lives on the class and is never reset between examples.
      cleanup_calls = FakeDerivativeService.cleanup_called
      described_class.plugins = [FakeDerivativeService, FakeDerivativeService]
      service = described_class.new(FileSet.new)
      service.cleanup_derivatives
      expect(FakeDerivativeService.cleanup_called).to eq cleanup_calls + 2
    end

    it "test meta: spec restores original plugins" do
      # verify `after do` clean up of plugins array to original value
      plugins = described_class.plugins
      expect(plugins.length).to eq @orig_plugins.length
      expect(plugins).to include Hyrax::FileSetDerivativesService
    end
  end

  describe "service registration" do
    # integration test with Hyrax, verify the service is registered

    it "is registered with Hyrax" do
      expect(Hyrax::DerivativeService.services).to include described_class
    end

    it "is the first valide service found" do
      found = Hyrax::DerivativeService.for(FileSet.new)
      expect(found.class).to be described_class
    end
  end

  # integration tests for plugins
  describe "runs multiple plugins, makes multiple derivatives" do
    # Full path of a named file in the fixtures directory.
    def source_image(name)
      File.join(fixture_path, name)
    end

    # Derivative paths Hyrax reports for the given file set.
    def derivatives_for(file_set)
      Hyrax::DerivativePath.derivatives_for_reference(file_set)
    end

    # Plugin classes expected to be valid for the persisted TIFF file set.
    def expected_plugins
      [
        Hyrax::FileSetDerivativesService,
        NewspaperWorks::JP2DerivativeService,
        NewspaperWorks::PDFDerivativeService,
        NewspaperWorks::TextExtractionDerivativeService,
        NewspaperWorks::TIFFDerivativeService
      ]
    end

    # The expected set of Plugins that will run for file set
    it "has expected valid plugins configured" do
      plugins = described_class.plugins
      fs = persisted_file_set
      services = plugins.map { |plugin| plugin.new(fs) }.select(&:valid?)
      expect(services.length).to eq 5
      used_plugins = services.map(&:class)
      expected_plugins.each do |plugin|
        expect(used_plugins).to include plugin
      end
    end

    it "creates expected derivatives from TIFF source" do
      svc = described_class.new(persisted_file_set)
      svc.create_derivatives(source_image('4.1.07.tiff'))
      made = derivatives_for(persisted_file_set)
      # An `expect` without a matcher asserts nothing, so state it explicitly.
      made.each { |path| expect(File.exist?(path)).to be true }
      extensions = made.map { |path| path.split('.')[-1] }
      expect(extensions).to include 'pdf'
      expect(extensions).to include 'jp2'
      expect(extensions).not_to include 'tiff'
      # Thumbnail, created by Hyrax:
      expect(extensions).to include 'jpeg'
    end
  end

  describe "ingest integration" do
    # Creates a log entry for the file set with destination name 'jp2'.
    # Uses create! so a record that fails to save raises immediately instead
    # of surfacing later as a misleading expectation failure.
    def log_attachment(file_set)
      NewspaperWorks::DerivativeAttachment.create!(
        fileset_id: file_set.id,
        path: '/some/arbitrary/path/to.jp2',
        destination_name: 'jp2'
      )
    end

    # True when any of the given plugin instances is exactly a
    # NewspaperWorks::JP2DerivativeService.
    def jp2_plugin?(plugins)
      plugins.any? do |plugin|
        plugin.class == NewspaperWorks::JP2DerivativeService
      end
    end

    it "will not attempt creating over pre-made derivative" do
      service = described_class.new(persisted_file_set)
      # this should be respected, evaluate by obtaining filtered
      # services list, which must omit JP2DerivativeService
      plugins = service.services(:create_derivatives)
      # initially has jp2 plugin
      expect(jp2_plugin?(plugins)).to be true
      # exclude jp2 by effect of log entry of pre-made attachment
      log_attachment(service.file_set)
      # omits, after logging intent of previous attachment:
      plugins = service.services(:create_derivatives)
      expect(jp2_plugin?(plugins)).to be false
    end
  end
end
@@ -0,0 +1,82 @@
require 'nokogiri'
require 'spec_helper'
require 'misc_shared'

# Specs for NewspaperWorks::TextExtractionDerivativeService: creation of
# ALTO XML, plain-text and JSON derivatives from a TIFF fixture, and the
# choice between running OCR and deferring to an already-attached ALTO file.
RSpec.describe NewspaperWorks::TextExtractionDerivativeService do
  # NOTE(review): `fixture_path` is not defined in this file; presumably it
  # is provided by this shared context (see misc_shared) - confirm.
  include_context "shared setup"

  let(:valid_file_set) do
    file_set = FileSet.new
    file_set.save!(validate: false)
    file_set
  end

  # A saved NewspaperPage with valid_file_set as a member. The block must end
  # with `work`; ending on `work.save!` would hand callers the boolean
  # result of the save instead of the work itself.
  let(:work) do
    work = NewspaperPage.create(title: ["Hello"])
    work.members << valid_file_set
    work.save!
    work
  end

  let(:minimal_alto) do
    File.join(fixture_path, 'minimal-alto.xml')
  end

  # ALTO 2.0 schema loaded from the fixtures directory.
  let(:altoxsd) do
    xsdpath = File.join(fixture_path, 'alto-2-0.xsd')
    Nokogiri::XML::Schema(File.read(xsdpath))
  end

  describe "Creates ALTO derivative" do
    # Full path of a named file in the fixtures directory.
    def source_image(name)
      File.join(fixture_path, name)
    end

    # Path Hyrax computes for a derivative of file_set with extension ext.
    def expected_path(file_set, ext)
      Hyrax::DerivativePath.derivative_path_for_reference(file_set, ext)
    end

    # Runs the ALTO schema against the named file and returns whatever
    # Nokogiri's Schema#validate returns for it.
    def validate_alto(filename)
      altoxsd.validate(filename)
    end

    # Expects a non-empty derivative file with the given extension to exist
    # for valid_file_set.
    def derivative_exists(ext)
      path = expected_path(valid_file_set, ext)
      expect(File.exist?(path)).to be true
      expect(File.size(path)).to be > 0
    end

    it "creates, stores valid ALTO and plain-text derivatives" do
      # these are in same test to avoid duplicate OCR operation
      service = described_class.new(valid_file_set)
      service.create_derivatives(source_image('ocr_mono.tiff'))
      # ALTO derivative file exists at expected path as a non-empty file:
      derivative_exists('xml')
      # ...and can be run through the ALTO schema.
      # NOTE(review): the returned schema errors are not asserted here;
      # confirm the OCR output validates cleanly before tightening this to
      # an `expect(...).to be_empty`.
      validate_alto(expected_path(valid_file_set, 'xml'))
      # Plain text and JSON exist as non-empty files:
      derivative_exists('txt')
      derivative_exists('json')
      json_path = expected_path(valid_file_set, 'json')
      loaded_result = JSON.parse(File.read(json_path))
      expect(loaded_result['coords'].length).to be > 1
    end

    it "usually uses OCR, when no existing text" do
      service = described_class.new(valid_file_set)
      # here, service will delegate create_derivatives to OCR impl method:
      expect(service).to receive(:create_derivatives_from_ocr)
      service.create_derivatives(source_image('ocr_mono.tiff'))
    end

    it "defers to existing ALTO sources, when present" do
      # Attach some ALTO to a work
      derivatives = NewspaperWorks::Data::WorkDerivatives.of(
        work,
        valid_file_set
      )
      derivatives.attach(minimal_alto, 'xml')
      # In this case, service will not call the OCR implementation method:
      service = described_class.new(valid_file_set)
      expect(service).not_to receive(:create_derivatives_from_ocr)
      service.create_derivatives(source_image('ocr_mono.tiff'))
    end
  end
end
@@ -0,0 +1,129 @@
require 'nokogiri'
require 'spec_helper'
require 'misc_shared'

# Specs for NewspaperWorks::TextFormatsFromALTOService: deriving JSON and
# plain-text derivatives from an ALTO XML derivative that is either already
# attached or logged as incoming, and scaling of ALTO coordinates.
RSpec.describe NewspaperWorks::TextFormatsFromALTOService do
  # NOTE(review): `fixture_path` is not defined in this file; presumably it
  # is provided by this shared context (see misc_shared) - confirm.
  include_context "shared setup"

  let(:valid_file_set) do
    file_set = FileSet.new
    file_set.save!(validate: false)
    file_set
  end

  # A saved NewspaperPage with valid_file_set as a member.
  let(:work) do
    work = NewspaperPage.create(title: ["Hello"])
    work.members << valid_file_set
    work.save!
    work
  end

  let(:minimal_alto) do
    File.join(fixture_path, 'minimal-alto.xml')
  end

  # Logs the minimal ALTO fixture as an attachment with destination name
  # 'xml' for the file set with the given id.
  def log_incoming_attachment(fsid)
    NewspaperWorks::DerivativeAttachment.create!(
      fileset_id: fsid,
      path: minimal_alto,
      destination_name: 'xml'
    )
  end

  # Derivatives adapter for the given work and file set.
  def derivatives_of(work, fileset)
    NewspaperWorks::Data::WorkDerivatives.of(work, fileset)
  end

  describe "Saves other formats from ALTO" do
    it "saves JSON, text from existing ALTO derivative" do
      derivatives = derivatives_of(work, valid_file_set)
      expect(derivatives.keys.size).to eq 0
      derivatives.attach(minimal_alto, 'xml')
      expect(derivatives.keys.size).to eq 1
      service = described_class.new(valid_file_set)
      service.create_derivatives('/some/random/primary/path/does_not/matter')
      # reload keys to check derivatives:
      derivatives.load_paths
      expect(derivatives.keys.size).to eq 3
      expect(derivatives.keys).to include 'json', 'txt'
    end

    it "saves JSON, text from incoming ALTO derivative" do
      derivatives = derivatives_of(work, valid_file_set)
      expect(derivatives.keys.size).to eq 0
      log_incoming_attachment(valid_file_set.id)
      service = described_class.new(valid_file_set)
      service.create_derivatives('/some/random/primary/path/does_not/matter')
      # reload keys to check derivatives:
      derivatives.load_paths
      expect(derivatives.keys).to include 'json', 'txt'
    end
  end

  describe "scaling matters" do
    # we need an ingested, characterized file:
    do_now_jobs = [
      IngestLocalFileJob,
      IngestJob,
      InheritPermissionsJob,
      CharacterizeJob
    ]
    # we omit CreateDerivativesJob from above, as obviously duplicative and
    # therefore potential cause of problems here.

    # remove any previous test run (development) artifacts in file
    # attachment logging tables
    before(:all) do
      NewspaperWorks::DerivativeAttachment.delete_all
      NewspaperWorks::IngestFileRelation.delete_all
    end

    # A saved NewspaperPage with no members; the primary file is attached
    # inside the example.
    let(:work) { NewspaperPage.create(title: ["Hello"]) }

    let(:tiff_path) { File.join(fixture_path, 'ocr_gray.tiff') }
    let(:ocr_alto_path) do
      File.join(fixture_path, 'ocr_alto_scaled_4pts_per_px.xml')
    end

    # Attaches the TIFF fixture as the work's primary file, then checks that
    # the stored file reports pixel dimensions.
    def attach_primary_file(work)
      attachment = NewspaperWorks::Data::WorkFiles.of(work)
      attachment.assign(tiff_path)
      attachment.commit!
      work.reload
      pcdm_file = NewspaperWorks::Data::WorkFiles.of(work).values[0].unwrapped
      expect(pcdm_file).not_to be_nil
      # we have image dimensions (px) to work with; `to_i` always yields an
      # Integer (0 for nil), so require a positive value to make this
      # check meaningful:
      expect(pcdm_file.width[0].to_i).to be > 0
      expect(pcdm_file.height[0].to_i).to be > 0
    end

    # Derivatives adapter reached through the work's files (one-argument
    # variant used only by the examples in this group).
    def derivatives_of(work)
      NewspaperWorks::Data::WorkFiles.of(work).derivatives
    end

    # Attaches the scaled ALTO fixture as the work's 'xml' derivative.
    def attach_alto(work)
      derivatives = derivatives_of(work)
      derivatives.attach(ocr_alto_path, 'xml')
      # has a path to now-stored derivative:
      expect(derivatives.path('xml')).not_to be_nil
    end

    it "scales ALTO points to original image", perform_enqueued: do_now_jobs do
      attach_primary_file(work)
      attach_alto(work)
      work.reload
      file_set = work.ordered_members.to_a.find { |m| m.class == FileSet }
      service = described_class.new(file_set)
      service.create_derivatives('/a/path/here/needed/but/will/not/matter')
      coords = JSON.parse(derivatives_of(work).data('json'))
      # test against known scaled coordinate of OCR data:
      # This roughly matches unscaled ALTO data for token 'Bethesda'
      # in spec/fixtures/files/ocr_alto.xml, with the disclaimer that
      # round-trip rounding error of 1px is noted for VPOS.
      expect(coords['coords']['Bethesda']).to eq [[16, 665, 78, 16]]
    end
  end
end
@@ -0,0 +1,58 @@
require 'shellwords'
require 'spec_helper'

# Specs for NewspaperWorks::TIFFDerivativeService: creating a TIFF
# derivative from one-bit TIFF, grayscale TIFF and PDF fixtures, inspected
# with the GraphicsMagick `gm identify` command.
RSpec.describe NewspaperWorks::TIFFDerivativeService do
  let(:valid_file_set) do
    file_set = FileSet.new
    file_set.save!(validate: false)
    file_set
  end

  let(:fixture_path) do
    File.join(NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files')
  end

  describe "Creates TIFF derivatives" do
    # Full path of a named file in the fixtures directory.
    def source_image(name)
      File.join(fixture_path, name)
    end

    # Path Hyrax computes for the 'tiff' derivative of the file set.
    def expected_path(file_set)
      Hyrax::DerivativePath.derivative_path_for_reference(file_set, 'tiff')
    end

    # Returns the stripped 'Geometry' line from `gm identify -verbose`
    # output for the file. The path is shell-escaped so that locations
    # containing spaces or shell metacharacters are passed through intact.
    def get_res(path)
      lines = `gm identify -verbose #{Shellwords.escape(path)}`.lines
      lines.find { |line| line.strip.start_with?('Geometry') }.strip
    end

    # Expects source and derivative to report the same 'Geometry' line
    # (despite the name, this is what get_res extracts); PDF sources are
    # skipped to avoid ghostscript warnings on stderr.
    def check_dpi_match(orig, dest)
      expect(get_res(orig)).to eq get_res(dest) unless orig.end_with?('pdf')
    end

    # Creates a TIFF derivative from the named fixture, checks that it
    # appears at the expected path and is identified as TIFF, compares it
    # with the source, then cleans up the derivatives.
    def makes_tiff(filename)
      expected = expected_path(valid_file_set)
      expect(File.exist?(expected)).to be false
      svc = described_class.new(valid_file_set)
      svc.create_derivatives(source_image(filename))
      expect(File.exist?(expected)).to be true
      desc = `gm identify #{Shellwords.escape(expected)}`
      expect(desc).to include 'TIFF'
      check_dpi_match(source_image(filename), expected)
      svc.cleanup_derivatives
    end

    it "creates gray TIFF derivative from one-bit source" do
      makes_tiff('page1.tiff')
    end

    it "creates gray TIFF from grayscale source" do
      makes_tiff('lowres-gray-via-ndnp-sample.tiff')
    end

    it "creates TIFF from PDF source, robust to multi-page" do
      makes_tiff('sample-color-newsletter.pdf')
    end
  end
end