newspaper_works 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.fcrepo_wrapper +4 -0
- data/.gitignore +43 -0
- data/.rubocop.yml +143 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +50 -0
- data/Gemfile +47 -0
- data/LICENSE +203 -0
- data/README.md +159 -0
- data/Rakefile +38 -0
- data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
- data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
- data/app/assets/config/newspaper_works_manifest.js +2 -0
- data/app/assets/images/newspaper_works/.keep +0 -0
- data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
- data/app/assets/javascripts/newspaper_works.js +4 -0
- data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
- data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
- data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
- data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
- data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
- data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
- data/app/forms/hyrax/newspaper_article_form.rb +11 -0
- data/app/forms/hyrax/newspaper_container_form.rb +11 -0
- data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
- data/app/forms/hyrax/newspaper_page_form.rb +15 -0
- data/app/forms/hyrax/newspaper_title_form.rb +12 -0
- data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
- data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
- data/app/helpers/newspaper_works/application_helper.rb +5 -0
- data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
- data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
- data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
- data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
- data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
- data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
- data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
- data/app/indexers/newspaper_article_indexer.rb +16 -0
- data/app/indexers/newspaper_container_indexer.rb +18 -0
- data/app/indexers/newspaper_issue_indexer.rb +26 -0
- data/app/indexers/newspaper_page_indexer.rb +9 -0
- data/app/indexers/newspaper_title_indexer.rb +19 -0
- data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
- data/app/jobs/newspaper_works/application_job.rb +4 -0
- data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
- data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
- data/app/mailers/newspaper_works/application_mailer.rb +8 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
- data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
- data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
- data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
- data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
- data/app/models/file_set.rb +10 -0
- data/app/models/newspaper_article.rb +158 -0
- data/app/models/newspaper_container.rb +86 -0
- data/app/models/newspaper_issue.rb +115 -0
- data/app/models/newspaper_page.rb +70 -0
- data/app/models/newspaper_title.rb +111 -0
- data/app/models/newspaper_works/application_record.rb +6 -0
- data/app/models/newspaper_works/derivative_attachment.rb +8 -0
- data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
- data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
- data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
- data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
- data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
- data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
- data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
- data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
- data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
- data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
- data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
- data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
- data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
- data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
- data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
- data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
- data/app/services/hyrax/article_genre_service.rb +9 -0
- data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
- data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
- data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
- data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
- data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
- data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
- data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
- data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
- data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
- data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
- data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
- data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
- data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
- data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
- data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
- data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
- data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
- data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
- data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
- data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
- data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
- data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
- data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
- data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
- data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
- data/app/views/newspaper_works/base/_show.html.erb +45 -0
- data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
- data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
- data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
- data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
- data/app/views/records/edit_fields/_genre.html.erb +4 -0
- data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
- data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/newspaper_article.de.yml +12 -0
- data/config/locales/newspaper_article.en.yml +12 -0
- data/config/locales/newspaper_article.es.yml +12 -0
- data/config/locales/newspaper_article.fr.yml +12 -0
- data/config/locales/newspaper_article.it.yml +12 -0
- data/config/locales/newspaper_article.pt-BR.yml +12 -0
- data/config/locales/newspaper_article.zh.yml +12 -0
- data/config/locales/newspaper_container.de.yml +8 -0
- data/config/locales/newspaper_container.en.yml +8 -0
- data/config/locales/newspaper_container.es.yml +8 -0
- data/config/locales/newspaper_container.fr.yml +8 -0
- data/config/locales/newspaper_container.it.yml +8 -0
- data/config/locales/newspaper_container.pt-BR.yml +8 -0
- data/config/locales/newspaper_container.zh.yml +8 -0
- data/config/locales/newspaper_issue.de.yml +8 -0
- data/config/locales/newspaper_issue.en.yml +8 -0
- data/config/locales/newspaper_issue.es.yml +8 -0
- data/config/locales/newspaper_issue.fr.yml +8 -0
- data/config/locales/newspaper_issue.it.yml +8 -0
- data/config/locales/newspaper_issue.pt-BR.yml +8 -0
- data/config/locales/newspaper_issue.zh.yml +8 -0
- data/config/locales/newspaper_page.de.yml +15 -0
- data/config/locales/newspaper_page.en.yml +15 -0
- data/config/locales/newspaper_page.es.yml +15 -0
- data/config/locales/newspaper_page.fr.yml +15 -0
- data/config/locales/newspaper_page.it.yml +15 -0
- data/config/locales/newspaper_page.pt-BR.yml +15 -0
- data/config/locales/newspaper_page.zh.yml +15 -0
- data/config/locales/newspaper_title.de.yml +8 -0
- data/config/locales/newspaper_title.en.yml +8 -0
- data/config/locales/newspaper_title.es.yml +8 -0
- data/config/locales/newspaper_title.fr.yml +8 -0
- data/config/locales/newspaper_title.it.yml +8 -0
- data/config/locales/newspaper_title.pt-BR.yml +8 -0
- data/config/locales/newspaper_title.zh.yml +8 -0
- data/config/locales/newspaper_works.de.yml +50 -0
- data/config/locales/newspaper_works.en.yml +52 -0
- data/config/locales/newspaper_works.es.yml +52 -0
- data/config/locales/newspaper_works.fr.yml +52 -0
- data/config/locales/newspaper_works.it.yml +52 -0
- data/config/locales/newspaper_works.pt-BR.yml +52 -0
- data/config/locales/newspaper_works.zh.yml +52 -0
- data/config/routes.rb +9 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
- data/lib/generators/newspaper_works/assets_generator.rb +29 -0
- data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
- data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
- data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
- data/lib/generators/newspaper_works/install_generator.rb +97 -0
- data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
- data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
- data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
- data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
- data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
- data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
- data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
- data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
- data/lib/newspaper_works/configuration.rb +14 -0
- data/lib/newspaper_works/data/fileset_helper.rb +25 -0
- data/lib/newspaper_works/data/path_helper.rb +40 -0
- data/lib/newspaper_works/data/work_derivatives.rb +314 -0
- data/lib/newspaper_works/data/work_file.rb +92 -0
- data/lib/newspaper_works/data/work_files.rb +181 -0
- data/lib/newspaper_works/data.rb +35 -0
- data/lib/newspaper_works/engine.rb +42 -0
- data/lib/newspaper_works/errors.rb +14 -0
- data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
- data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
- data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
- data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
- data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
- data/lib/newspaper_works/ingest/from_command.rb +52 -0
- data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
- data/lib/newspaper_works/ingest/issue_images.rb +51 -0
- data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
- data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
- data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
- data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
- data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
- data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
- data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
- data/lib/newspaper_works/ingest/ndnp.rb +21 -0
- data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
- data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
- data/lib/newspaper_works/ingest/page_image.rb +52 -0
- data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
- data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
- data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
- data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
- data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
- data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
- data/lib/newspaper_works/ingest/publication_info.rb +44 -0
- data/lib/newspaper_works/ingest.rb +90 -0
- data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
- data/lib/newspaper_works/logging.rb +54 -0
- data/lib/newspaper_works/page_finder.rb +62 -0
- data/lib/newspaper_works/resource_fetcher.rb +78 -0
- data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
- data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
- data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
- data/lib/newspaper_works/text_extraction.rb +10 -0
- data/lib/newspaper_works/version.rb +3 -0
- data/lib/newspaper_works.rb +19 -0
- data/lib/tasks/newspaper_works_tasks.rake +39 -0
- data/newspaper_works.gemspec +49 -0
- data/spec/.keep.txt +1 -0
- data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
- data/spec/controllers/catalog_controller_spec.rb +63 -0
- data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
- data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_issue_ingest.rb +6 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_ingest.rb +6 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/features/front_pages_for_title_spec.rb +19 -0
- data/spec/features/newspaper_title_search_spec.rb +30 -0
- data/spec/features/newspapers_search_spec.rb +49 -0
- data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
- data/spec/features_shared.rb +71 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
- data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
- data/spec/fixtures/files/resource_mocks/urls.json +82 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
- data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
- data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
- data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
- data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
- data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
- data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
- data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
- data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
- data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
- data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
- data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
- data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
- data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
- data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
- data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
- data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
- data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
- data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
- data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
- data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
- data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
- data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
- data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
- data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
- data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
- data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
- data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
- data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
- data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
- data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
- data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
- data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
- data/spec/lib/newspaper_works/logging_spec.rb +53 -0
- data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
- data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
- data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
- data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
- data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
- data/spec/misc_shared.rb +109 -0
- data/spec/model_shared.rb +134 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
- data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
- data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
- data/spec/models/newspaper_article_spec.rb +73 -0
- data/spec/models/newspaper_container_spec.rb +111 -0
- data/spec/models/newspaper_issue_spec.rb +91 -0
- data/spec/models/newspaper_page_spec.rb +44 -0
- data/spec/models/newspaper_title_spec.rb +116 -0
- data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
- data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/ndnp_shared.rb +48 -0
- data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
- data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
- data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
- data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
- data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
- data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
- data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
- data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
- data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
- data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
- data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
- data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
- data/spec/routing/route_spec.rb +52 -0
- data/spec/search_builders/custom_search_builder_spec.rb +34 -0
- data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
- data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
- data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
- data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
- data/spec/spec_helper.rb +261 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
- data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
- data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
- data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
- data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
- data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
- data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
- data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
- data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
- data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
- data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
- data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
- data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
- data/tasks/newspaperworks_dev.rake +26 -0
- data/test/integration/navigation_test.rb +7 -0
- data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
- data/test/newspaper_works_test.rb +7 -0
- data/test/test_helper.rb +17 -0
- data/tmp/.keep +0 -0
- metadata +1037 -0
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
require 'fileutils'
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
|
|
4
|
+
RSpec.describe NewspaperWorks::PluggableDerivativeService do
|
|
5
|
+
# A freshly instantiated FileSet that is never saved.
let(:valid_file_set) { FileSet.new }

# A FileSet saved (validations skipped) as a member of a NewspaperPage,
# so that it carries a persisted id.
let(:persisted_file_set) do
  FileSet.new.tap do |file_set|
    page = NewspaperPage.new
    page.title = ['This is a page!']
    page.members.push(file_set)
    # Set the mime type directly on the object instead of attaching a file.
    file_set.instance_variable_set(:@mime_type, 'image/tiff')
    file_set.save!(validate: false)
    page.save!(validate: false)
  end
end
|
|
17
|
+
|
|
18
|
+
# Directory of spec fixture files inside the gem checkout.
let(:fixture_path) { File.join(NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files') }
|
|
23
|
+
|
|
24
|
+
# Remember the plugin list registered on the described class before each
# example and put it back afterwards, so examples that reassign
# `plugins` do not affect one another.
before { @orig_plugins = described_class.plugins }

after { described_class.plugins = @orig_plugins }
|
|
31
|
+
|
|
32
|
+
describe ".plugins=" do
  it "allows setting of derivative service plugins" do
    replacement = [Hyrax::FileSetDerivativesService] * 2

    # Starts out with whatever the `before` hook captured...
    expect(described_class.plugins).to eq @orig_plugins

    # ...and reports the new list once it has been assigned.
    described_class.plugins = replacement
    expect(described_class.plugins).to eq [Hyrax::FileSetDerivativesService] * 2
  end
end
|
|
39
|
+
|
|
40
|
+
describe "calls all derivative plugins" do
|
|
41
|
+
# Test double for a derivative service plugin. It performs no real work;
# it only counts, at class level, how many times any instance was asked to
# create or clean up derivatives, so examples can assert on the counts.
class FakeDerivativeService
  # Class-level instance variables: shared by every instance of the fake
  # and never reset here, so the tallies accumulate across examples.
  @create_called = 0
  @cleanup_called = 0

  class << self
    attr_accessor :create_called, :cleanup_called

    # File extension this fake reports for its output.
    # NOTE(review): presumably used by PluggableDerivativeService to locate
    # an existing derivative -- confirm against the service implementation.
    def target_ext
      'txt'
    end
  end

  # @param fileset [Object] the file set handed over by the caller; it is
  #   stored but not otherwise used by the fake.
  def initialize(fileset)
    @fileset = fileset
  end

  # The fake always reports itself as applicable.
  # @return [true]
  def valid?
    true
  end

  # Records one "create" call.
  # @param filename [String] source path; never read from disk
  # @return [String] the filename, unchanged
  def create_derivatives(filename)
    self.class.create_called += 1
    filename
  end

  # Records one "cleanup" call.
  # @return [Integer] the updated class-level cleanup tally
  def cleanup_derivatives
    self.class.cleanup_called += 1
  end
end
|
|
70
|
+
|
|
71
|
+
def touch_fake_derivative_file(file_set, ext)
|
|
72
|
+
path = Hyrax::DerivativePath.derivative_path_for_reference(file_set, ext)
|
|
73
|
+
FileUtils.mkdir_p(File.join(path.split('/')[0..-2]))
|
|
74
|
+
FileUtils.touch(path)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
it "calls each plugin on create" do
|
|
78
|
+
create_calls = FakeDerivativeService.create_called
|
|
79
|
+
described_class.plugins = [FakeDerivativeService, FakeDerivativeService]
|
|
80
|
+
service = described_class.new(FileSet.new)
|
|
81
|
+
service.create_derivatives('not_a_real_filename')
|
|
82
|
+
expect(FakeDerivativeService.create_called).to eq create_calls + 2
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
it "does not re-create existing derivative" do
|
|
86
|
+
create_calls = FakeDerivativeService.create_called
|
|
87
|
+
described_class.plugins = [FakeDerivativeService]
|
|
88
|
+
service = described_class.new(persisted_file_set)
|
|
89
|
+
expect(persisted_file_set.id).not_to be_nil
|
|
90
|
+
# Fake is configured to have 'txt' destination_path, let's create a
|
|
91
|
+
# destination file in Hyrax's opinionated plate for dest. name.
|
|
92
|
+
touch_fake_derivative_file(persisted_file_set, 'txt')
|
|
93
|
+
service.create_derivatives('/nonsense/source/path/ignored')
|
|
94
|
+
# create calls logged by fake should not increment,
|
|
95
|
+
# as PluggableDerivativeService should have skipped calling
|
|
96
|
+
# plugin's create_derivatives method w/ presence of existing derivative
|
|
97
|
+
expect(FakeDerivativeService.create_called).to eq create_calls
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
it "calls each plugin on cleanup" do
|
|
101
|
+
expect(FakeDerivativeService.cleanup_called).to eq 0
|
|
102
|
+
described_class.plugins = [FakeDerivativeService, FakeDerivativeService]
|
|
103
|
+
service = described_class.new(FileSet.new)
|
|
104
|
+
service.cleanup_derivatives
|
|
105
|
+
expect(FakeDerivativeService.cleanup_called).to eq 2
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
it "test meta: spec restores original plugins" do
|
|
109
|
+
# verify `after do` clean up of plugins array to original value
|
|
110
|
+
plugins = described_class.plugins
|
|
111
|
+
expect(plugins.length).to eq @orig_plugins.length
|
|
112
|
+
expect(plugins).to include Hyrax::FileSetDerivativesService
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
describe "service registration" do
|
|
117
|
+
# integration test with Hyrax, verify services is registered
|
|
118
|
+
|
|
119
|
+
it "is registered with Hyrax" do
|
|
120
|
+
expect(Hyrax::DerivativeService.services).to include described_class
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
it "is the first valide service found" do
|
|
124
|
+
found = Hyrax::DerivativeService.for(FileSet.new)
|
|
125
|
+
expect(found.class).to be described_class
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# integration tests for plugins
|
|
130
|
+
describe "runs multiple plugins, makes multiple derivatives" do
|
|
131
|
+
def source_image(name)
|
|
132
|
+
File.join(fixture_path, name)
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
def derivatives_for(file_set)
|
|
136
|
+
Hyrax::DerivativePath.derivatives_for_reference(file_set)
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def expected_plugins
|
|
140
|
+
[
|
|
141
|
+
Hyrax::FileSetDerivativesService,
|
|
142
|
+
NewspaperWorks::JP2DerivativeService,
|
|
143
|
+
NewspaperWorks::PDFDerivativeService,
|
|
144
|
+
NewspaperWorks::TextExtractionDerivativeService,
|
|
145
|
+
NewspaperWorks::TIFFDerivativeService
|
|
146
|
+
]
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
# The expected set of Plugins that will run for file set
|
|
150
|
+
it "has expected valid plugins configured" do
|
|
151
|
+
plugins = described_class.plugins
|
|
152
|
+
fs = persisted_file_set
|
|
153
|
+
services = plugins.map { |plugin| plugin.new(fs) }.select(&:valid?)
|
|
154
|
+
expect(services.length).to eq 5
|
|
155
|
+
used_plugins = services.map(&:class)
|
|
156
|
+
expected_plugins.each do |plugin|
|
|
157
|
+
expect(used_plugins).to include plugin
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
it "creates expected derivatives from TIFF source" do
|
|
162
|
+
svc = described_class.new(persisted_file_set)
|
|
163
|
+
svc.create_derivatives(source_image('4.1.07.tiff'))
|
|
164
|
+
made = derivatives_for(persisted_file_set)
|
|
165
|
+
made.each { |path| expect(File.exist?(path)) }
|
|
166
|
+
extensions = made.map { |path| path.split('.')[-1] }
|
|
167
|
+
expect(extensions).to include 'pdf'
|
|
168
|
+
expect(extensions).to include 'jp2'
|
|
169
|
+
expect(extensions).not_to include 'tiff'
|
|
170
|
+
# Thumbnail, created by Hyrax:
|
|
171
|
+
expect(extensions).to include 'jpeg'
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
describe "ingest integration" do
|
|
176
|
+
def log_attachment(file_set)
|
|
177
|
+
# create a log entry for the fileset given destination name 'jp2'
|
|
178
|
+
NewspaperWorks::DerivativeAttachment.create(
|
|
179
|
+
fileset_id: file_set.id,
|
|
180
|
+
path: '/some/arbitrary/path/to.jp2',
|
|
181
|
+
destination_name: 'jp2'
|
|
182
|
+
)
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def jp2_plugin?(plugins)
|
|
186
|
+
r = plugins.select { |p| p.class == NewspaperWorks::JP2DerivativeService }
|
|
187
|
+
!r.empty?
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
it "will not attempt creating over pre-made derivative" do
|
|
191
|
+
service = described_class.new(persisted_file_set)
|
|
192
|
+
# this should be respected, evaluate by obtaining filtered
|
|
193
|
+
# services list, which must omit JP2DerivativeService
|
|
194
|
+
plugins = service.services(:create_derivatives)
|
|
195
|
+
# initially has jp2 plugin
|
|
196
|
+
expect(jp2_plugin?(plugins)).to be true
|
|
197
|
+
# blacklist jp2 by effect of log entry of pre-made attachment
|
|
198
|
+
log_attachment(service.file_set)
|
|
199
|
+
# omits, after logging intent of previous attachment:
|
|
200
|
+
plugins = service.services(:create_derivatives)
|
|
201
|
+
expect(jp2_plugin?(plugins)).to be false
|
|
202
|
+
end
|
|
203
|
+
end
|
|
204
|
+
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
require 'misc_shared'
|
|
4
|
+
|
|
5
|
+
# Specs for NewspaperWorks::TextExtractionDerivativeService: creation of
# ALTO XML, plain-text and JSON derivatives, and deferral to pre-existing
# ALTO instead of running OCR.
RSpec.describe NewspaperWorks::TextExtractionDerivativeService do
  include_context "shared setup"

  # A FileSet saved without validation.
  let(:valid_file_set) do
    file_set = FileSet.new
    file_set.save!(validate: false)
    file_set
  end

  # A saved NewspaperPage having valid_file_set as a member.
  let(:work) do
    work = NewspaperPage.create(title: ["Hello"])
    work.members << valid_file_set
    work.save!
    # return the work itself, not the boolean result of save!
    work
  end

  # Path to the minimal ALTO fixture.
  let(:minimal_alto) do
    File.join(fixture_path, 'minimal-alto.xml')
  end

  # Nokogiri schema object built from the ALTO 2.0 XSD fixture.
  let(:altoxsd) do
    xsdpath = File.join(fixture_path, 'alto-2-0.xsd')
    Nokogiri::XML::Schema(File.read(xsdpath))
  end

  describe "Creates ALTO derivative" do
    # Full path of a named file in the fixture directory.
    def source_image(name)
      File.join(fixture_path, name)
    end

    # Derivative path Hyrax computes for (file_set, ext).
    def expected_path(file_set, ext)
      Hyrax::DerivativePath.derivative_path_for_reference(file_set, ext)
    end

    # Validates the named file against the ALTO schema; returns whatever
    # the schema's validate call returns (a list of validation errors).
    def validate_alto(filename)
      altoxsd.validate(filename)
    end

    # Asserts a non-empty derivative file exists for valid_file_set and ext.
    def derivative_exists(ext)
      path = expected_path(valid_file_set, ext)
      expect(File.exist?(path)).to be true
      expect(File.size(path)).to be > 0
    end

    it "creates, stores valid ALTO and plain-text derivatives" do
      # these are in same test to avoid duplicate OCR operation
      service = described_class.new(valid_file_set)
      service.create_derivatives(source_image('ocr_mono.tiff'))
      # ALTO derivative file exists at expected path and validates;
      # the validation result must be asserted, or errors go unnoticed:
      derivative_exists('xml')
      expect(validate_alto(expected_path(valid_file_set, 'xml'))).to be_empty
      # Plain text exists as non-empty file:
      derivative_exists('txt')
      derivative_exists('json')
      json_path = expected_path(valid_file_set, 'json')
      loaded_result = JSON.parse(File.read(json_path))
      expect(loaded_result['coords'].length).to be > 1
    end

    it "usually uses OCR, when no existing text" do
      service = described_class.new(valid_file_set)
      # here, service will delegate create_derivatives to OCR impl method:
      expect(service).to receive(:create_derivatives_from_ocr)
      service.create_derivatives(source_image('ocr_mono.tiff'))
    end

    it "defers to existing ALTO sources, when present" do
      # Attach some ALTO to a work
      derivatives = NewspaperWorks::Data::WorkDerivatives.of(
        work,
        valid_file_set
      )
      derivatives.attach(minimal_alto, 'xml')
      # In this case, service will not call the OCR implementation method:
      service = described_class.new(valid_file_set)
      expect(service).not_to receive(:create_derivatives_from_ocr)
      service.create_derivatives(source_image('ocr_mono.tiff'))
    end
  end
end
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
require 'spec_helper'
|
|
3
|
+
require 'misc_shared'
|
|
4
|
+
|
|
5
|
+
# Specs for NewspaperWorks::TextFormatsFromALTOService: deriving JSON and
# plain-text formats from existing or incoming ALTO, and scaling of ALTO
# coordinates to the source image.
RSpec.describe NewspaperWorks::TextFormatsFromALTOService do
  include_context "shared setup"

  # A FileSet saved without validation.
  let(:valid_file_set) do
    file_set = FileSet.new
    file_set.save!(validate: false)
    file_set
  end

  # A saved NewspaperPage having valid_file_set as a member.
  let(:work) do
    work = NewspaperPage.create(title: ["Hello"])
    work.members << valid_file_set
    work.save!
    work
  end

  # Path to the minimal ALTO fixture.
  let(:minimal_alto) do
    File.join(fixture_path, 'minimal-alto.xml')
  end

  # Logs the minimal ALTO fixture as an attachment for the given file set
  # id, under destination name 'xml'.
  def log_incoming_attachment(fsid)
    NewspaperWorks::DerivativeAttachment.create!(
      fileset_id: fsid,
      path: minimal_alto,
      destination_name: 'xml'
    )
  end

  # Derivatives adapter for the given work and file set.
  def derivatives_of(work, fileset)
    NewspaperWorks::Data::WorkDerivatives.of(work, fileset)
  end

  describe "Saves other formats from ALTO" do
    it "saves JSON, text from existing ALTO derivative" do
      derivatives = derivatives_of(work, valid_file_set)
      expect(derivatives.keys.size).to eq 0
      derivatives.attach(minimal_alto, 'xml')
      expect(derivatives.keys.size).to eq 1
      service = described_class.new(valid_file_set)
      service.create_derivatives('/some/random/primary/path/does_not/matter')
      derivatives.load_paths
      expect(derivatives.keys.size).to eq 3
      expect(derivatives.keys).to include 'json', 'txt'
    end

    it "saves JSON, text from incoming ALTO derivative" do
      derivatives = derivatives_of(work, valid_file_set)
      expect(derivatives.keys.size).to eq 0
      log_incoming_attachment(valid_file_set.id)
      service = described_class.new(valid_file_set)
      service.create_derivatives('/some/random/primary/path/does_not/matter')
      # reload keys to check derivatives:
      derivatives.load_paths
      expect(derivatives.keys).to include 'json', 'txt'
    end
  end

  describe "scaling matters" do
    # we need an ingested, characterized file:
    do_now_jobs = [
      IngestLocalFileJob,
      IngestJob,
      InheritPermissionsJob,
      CharacterizeJob
    ]
    # we omit CreateDerivativesJob from above, as obviously duplicative and
    # therefore potential cause of problems here.

    # remove any previous test run (development) artifacts in file
    # attachment logging tables
    before(:all) do
      NewspaperWorks::DerivativeAttachment.all.delete_all
      NewspaperWorks::IngestFileRelation.all.delete_all
    end

    # A NewspaperPage with no file set attached yet.
    let(:work) do
      work = NewspaperPage.create(title: ["Hello"])
      work
    end

    let(:tiff_path) { File.join(fixture_path, 'ocr_gray.tiff') }
    let(:ocr_alto_path) do
      File.join(fixture_path, 'ocr_alto_scaled_4pts_per_px.xml')
    end

    # Assigns and commits the TIFF fixture as the work's primary file, then
    # checks the stored file reports image dimensions.
    def attach_primary_file(work)
      attachment = NewspaperWorks::Data::WorkFiles.of(work)
      attachment.assign(tiff_path)
      attachment.commit!
      work.reload
      pcdm_file = NewspaperWorks::Data::WorkFiles.of(work).values[0].unwrapped
      expect(pcdm_file).not_to be_nil
      # we have image dimensions (px) to work with; a missing value
      # converts to 0 via to_i, so require a positive number:
      expect(pcdm_file.width[0].to_i).to be > 0
      expect(pcdm_file.height[0].to_i).to be > 0
    end

    # Derivatives adapter obtained through the work's files.
    def derivatives_of(work)
      NewspaperWorks::Data::WorkFiles.of(work).derivatives
    end

    # Attaches the scaled ALTO fixture as the work's 'xml' derivative.
    def attach_alto(work)
      derivatives = derivatives_of(work)
      derivatives.attach(ocr_alto_path, 'xml')
      # has a path to now-stored derivative:
      expect(derivatives.path('xml')).not_to be_nil
    end

    it "scales ALTO points to original image", perform_enqueued: do_now_jobs do
      attach_primary_file(work)
      attach_alto(work)
      work.reload
      file_set = work.ordered_members.to_a.find { |m| m.class == FileSet }
      service = described_class.new(file_set)
      service.create_derivatives('/a/path/here/needed/but/will/not/matter')
      coords = JSON.parse(derivatives_of(work).data('json'))
      # test against known scaled coordinate of OCR data:
      # This roughly matches unscaled ALTO data for token 'Bethesda'
      # in spec/fixtures/files/ocr_alto.xml, with the disclaimer that
      # round-trip rounding error of 1px is noted for VPOS.
      expect(coords['coords']['Bethesda']).to eq [[16, 665, 78, 16]]
    end
  end
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
# Specs for NewspaperWorks::TIFFDerivativeService: creation of TIFF
# derivatives from one-bit, grayscale and PDF sources.
RSpec.describe NewspaperWorks::TIFFDerivativeService do
  # A FileSet saved without validation.
  let(:valid_file_set) do
    file_set = FileSet.new
    file_set.save!(validate: false)
    file_set
  end

  # Directory containing this gem's spec fixture files.
  let(:fixture_path) do
    File.join(
      NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files'
    )
  end

  describe "Creates TIFF derivatives" do
    # Full path of a named file in the fixture directory.
    def source_image(name)
      File.join(fixture_path, name)
    end

    # Derivative path Hyrax computes for the file set's 'tiff' destination.
    def expected_path(file_set)
      Hyrax::DerivativePath.derivative_path_for_reference(file_set, 'tiff')
    end

    # Returns the stripped 'Geometry' line of `gm identify -verbose` output
    # for the file at path. The command is run with an argument vector (no
    # shell), so paths containing spaces or shell metacharacters are safe.
    def get_res(path)
      lines = IO.popen(['gm', 'identify', '-verbose', path], &:read).lines
      lines.find { |line| line.strip.start_with?('Geometry') }.strip
    end

    def check_dpi_match(orig, dest)
      # check ppi, but skip pdf to avoid ghostscript warnings to stderr
      expect(get_res(orig)).to eq get_res(dest) unless orig.end_with?('pdf')
    end

    # Runs the service on the named fixture and checks that a TIFF appears
    # at the expected derivative path, then cleans up derivatives.
    def makes_tiff(filename)
      expected = expected_path(valid_file_set)
      expect(File.exist?(expected)).to be false
      svc = described_class.new(valid_file_set)
      svc.create_derivatives(source_image(filename))
      expect(File.exist?(expected)).to be true
      # argument-vector form avoids shell interpretation of the path
      desc = IO.popen(['gm', 'identify', expected], &:read)
      expect(desc).to include 'TIFF'
      check_dpi_match(source_image(filename), expected)
      svc.cleanup_derivatives
    end

    it "creates gray TIFF derivative from one-bit source" do
      makes_tiff('page1.tiff')
    end

    it "creates gray TIFF from grayscale source" do
      makes_tiff('lowres-gray-via-ndnp-sample.tiff')
    end

    it "creates TIFF from PDF source, robust to multi-page" do
      makes_tiff('sample-color-newsletter.pdf')
    end
  end
end
|