newspaper_works 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.fcrepo_wrapper +4 -0
- data/.gitignore +43 -0
- data/.rubocop.yml +143 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +50 -0
- data/Gemfile +47 -0
- data/LICENSE +203 -0
- data/README.md +159 -0
- data/Rakefile +38 -0
- data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
- data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
- data/app/assets/config/newspaper_works_manifest.js +2 -0
- data/app/assets/images/newspaper_works/.keep +0 -0
- data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
- data/app/assets/javascripts/newspaper_works.js +4 -0
- data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
- data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
- data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
- data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
- data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
- data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
- data/app/forms/hyrax/newspaper_article_form.rb +11 -0
- data/app/forms/hyrax/newspaper_container_form.rb +11 -0
- data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
- data/app/forms/hyrax/newspaper_page_form.rb +15 -0
- data/app/forms/hyrax/newspaper_title_form.rb +12 -0
- data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
- data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
- data/app/helpers/newspaper_works/application_helper.rb +5 -0
- data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
- data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
- data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
- data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
- data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
- data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
- data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
- data/app/indexers/newspaper_article_indexer.rb +16 -0
- data/app/indexers/newspaper_container_indexer.rb +18 -0
- data/app/indexers/newspaper_issue_indexer.rb +26 -0
- data/app/indexers/newspaper_page_indexer.rb +9 -0
- data/app/indexers/newspaper_title_indexer.rb +19 -0
- data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
- data/app/jobs/newspaper_works/application_job.rb +4 -0
- data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
- data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
- data/app/mailers/newspaper_works/application_mailer.rb +8 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
- data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
- data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
- data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
- data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
- data/app/models/file_set.rb +10 -0
- data/app/models/newspaper_article.rb +158 -0
- data/app/models/newspaper_container.rb +86 -0
- data/app/models/newspaper_issue.rb +115 -0
- data/app/models/newspaper_page.rb +70 -0
- data/app/models/newspaper_title.rb +111 -0
- data/app/models/newspaper_works/application_record.rb +6 -0
- data/app/models/newspaper_works/derivative_attachment.rb +8 -0
- data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
- data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
- data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
- data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
- data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
- data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
- data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
- data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
- data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
- data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
- data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
- data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
- data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
- data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
- data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
- data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
- data/app/services/hyrax/article_genre_service.rb +9 -0
- data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
- data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
- data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
- data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
- data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
- data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
- data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
- data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
- data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
- data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
- data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
- data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
- data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
- data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
- data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
- data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
- data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
- data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
- data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
- data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
- data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
- data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
- data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
- data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
- data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
- data/app/views/newspaper_works/base/_show.html.erb +45 -0
- data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
- data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
- data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
- data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
- data/app/views/records/edit_fields/_genre.html.erb +4 -0
- data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
- data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/newspaper_article.de.yml +12 -0
- data/config/locales/newspaper_article.en.yml +12 -0
- data/config/locales/newspaper_article.es.yml +12 -0
- data/config/locales/newspaper_article.fr.yml +12 -0
- data/config/locales/newspaper_article.it.yml +12 -0
- data/config/locales/newspaper_article.pt-BR.yml +12 -0
- data/config/locales/newspaper_article.zh.yml +12 -0
- data/config/locales/newspaper_container.de.yml +8 -0
- data/config/locales/newspaper_container.en.yml +8 -0
- data/config/locales/newspaper_container.es.yml +8 -0
- data/config/locales/newspaper_container.fr.yml +8 -0
- data/config/locales/newspaper_container.it.yml +8 -0
- data/config/locales/newspaper_container.pt-BR.yml +8 -0
- data/config/locales/newspaper_container.zh.yml +8 -0
- data/config/locales/newspaper_issue.de.yml +8 -0
- data/config/locales/newspaper_issue.en.yml +8 -0
- data/config/locales/newspaper_issue.es.yml +8 -0
- data/config/locales/newspaper_issue.fr.yml +8 -0
- data/config/locales/newspaper_issue.it.yml +8 -0
- data/config/locales/newspaper_issue.pt-BR.yml +8 -0
- data/config/locales/newspaper_issue.zh.yml +8 -0
- data/config/locales/newspaper_page.de.yml +15 -0
- data/config/locales/newspaper_page.en.yml +15 -0
- data/config/locales/newspaper_page.es.yml +15 -0
- data/config/locales/newspaper_page.fr.yml +15 -0
- data/config/locales/newspaper_page.it.yml +15 -0
- data/config/locales/newspaper_page.pt-BR.yml +15 -0
- data/config/locales/newspaper_page.zh.yml +15 -0
- data/config/locales/newspaper_title.de.yml +8 -0
- data/config/locales/newspaper_title.en.yml +8 -0
- data/config/locales/newspaper_title.es.yml +8 -0
- data/config/locales/newspaper_title.fr.yml +8 -0
- data/config/locales/newspaper_title.it.yml +8 -0
- data/config/locales/newspaper_title.pt-BR.yml +8 -0
- data/config/locales/newspaper_title.zh.yml +8 -0
- data/config/locales/newspaper_works.de.yml +50 -0
- data/config/locales/newspaper_works.en.yml +52 -0
- data/config/locales/newspaper_works.es.yml +52 -0
- data/config/locales/newspaper_works.fr.yml +52 -0
- data/config/locales/newspaper_works.it.yml +52 -0
- data/config/locales/newspaper_works.pt-BR.yml +52 -0
- data/config/locales/newspaper_works.zh.yml +52 -0
- data/config/routes.rb +9 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
- data/lib/generators/newspaper_works/assets_generator.rb +29 -0
- data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
- data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
- data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
- data/lib/generators/newspaper_works/install_generator.rb +97 -0
- data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
- data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
- data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
- data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
- data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
- data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
- data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
- data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
- data/lib/newspaper_works/configuration.rb +14 -0
- data/lib/newspaper_works/data/fileset_helper.rb +25 -0
- data/lib/newspaper_works/data/path_helper.rb +40 -0
- data/lib/newspaper_works/data/work_derivatives.rb +314 -0
- data/lib/newspaper_works/data/work_file.rb +92 -0
- data/lib/newspaper_works/data/work_files.rb +181 -0
- data/lib/newspaper_works/data.rb +35 -0
- data/lib/newspaper_works/engine.rb +42 -0
- data/lib/newspaper_works/errors.rb +14 -0
- data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
- data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
- data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
- data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
- data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
- data/lib/newspaper_works/ingest/from_command.rb +52 -0
- data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
- data/lib/newspaper_works/ingest/issue_images.rb +51 -0
- data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
- data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
- data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
- data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
- data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
- data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
- data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
- data/lib/newspaper_works/ingest/ndnp.rb +21 -0
- data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
- data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
- data/lib/newspaper_works/ingest/page_image.rb +52 -0
- data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
- data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
- data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
- data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
- data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
- data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
- data/lib/newspaper_works/ingest/publication_info.rb +44 -0
- data/lib/newspaper_works/ingest.rb +90 -0
- data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
- data/lib/newspaper_works/logging.rb +54 -0
- data/lib/newspaper_works/page_finder.rb +62 -0
- data/lib/newspaper_works/resource_fetcher.rb +78 -0
- data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
- data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
- data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
- data/lib/newspaper_works/text_extraction.rb +10 -0
- data/lib/newspaper_works/version.rb +3 -0
- data/lib/newspaper_works.rb +19 -0
- data/lib/tasks/newspaper_works_tasks.rake +39 -0
- data/newspaper_works.gemspec +49 -0
- data/spec/.keep.txt +1 -0
- data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
- data/spec/controllers/catalog_controller_spec.rb +63 -0
- data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
- data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_issue_ingest.rb +6 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_ingest.rb +6 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/features/front_pages_for_title_spec.rb +19 -0
- data/spec/features/newspaper_title_search_spec.rb +30 -0
- data/spec/features/newspapers_search_spec.rb +49 -0
- data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
- data/spec/features_shared.rb +71 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
- data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
- data/spec/fixtures/files/resource_mocks/urls.json +82 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
- data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
- data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
- data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
- data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
- data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
- data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
- data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
- data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
- data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
- data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
- data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
- data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
- data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
- data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
- data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
- data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
- data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
- data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
- data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
- data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
- data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
- data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
- data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
- data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
- data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
- data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
- data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
- data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
- data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
- data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
- data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
- data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
- data/spec/lib/newspaper_works/logging_spec.rb +53 -0
- data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
- data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
- data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
- data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
- data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
- data/spec/misc_shared.rb +109 -0
- data/spec/model_shared.rb +134 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
- data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
- data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
- data/spec/models/newspaper_article_spec.rb +73 -0
- data/spec/models/newspaper_container_spec.rb +111 -0
- data/spec/models/newspaper_issue_spec.rb +91 -0
- data/spec/models/newspaper_page_spec.rb +44 -0
- data/spec/models/newspaper_title_spec.rb +116 -0
- data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
- data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/ndnp_shared.rb +48 -0
- data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
- data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
- data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
- data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
- data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
- data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
- data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
- data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
- data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
- data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
- data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
- data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
- data/spec/routing/route_spec.rb +52 -0
- data/spec/search_builders/custom_search_builder_spec.rb +34 -0
- data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
- data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
- data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
- data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
- data/spec/spec_helper.rb +261 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
- data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
- data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
- data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
- data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
- data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
- data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
- data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
- data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
- data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
- data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
- data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
- data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
- data/tasks/newspaperworks_dev.rake +26 -0
- data/test/integration/navigation_test.rb +7 -0
- data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
- data/test/newspaper_works_test.rb +7 -0
- data/test/test_helper.rb +17 -0
- data/tmp/.keep +0 -0
- metadata +1037 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'spec_helper'

# Spec for the page-finding methods (pages_for_issue, ordered_pages,
# get_page_index), called here on a NewspaperWorks::NewspapersController
# instance. NOTE(review): presumably the controller mixes in
# NewspaperWorks::PageFinder -- confirm against the controller source.
RSpec.describe NewspaperWorks::PageFinder do
  # use before(:all) so we only create fixtures once
  before(:all) do
    @issue = NewspaperIssue.new
    @issue.title = ["Yesterday's News: December 7, 1941"]

    @page1 = NewspaperPage.new
    @page1.title = ['Page 1']
    @page2 = NewspaperPage.new
    @page2.title = ['Page 2']
    @page3 = NewspaperPage.new
    @page3.title = ['Page 3']

    # Attach the pages to the issue in reading order before saving.
    @issue.ordered_members << @page1
    @issue.ordered_members << @page2
    @issue.ordered_members << @page3

    @issue.save!
    @page1.save!
    @page2.save!
    @page3.save!

    # Solr documents are looked up only after every object has been saved.
    @page1_solr_doc = SolrDocument.find(@page1.id)
    @page2_solr_doc = SolrDocument.find(@page2.id)
    @page3_solr_doc = SolrDocument.find(@page3.id)
  end

  let(:controller) { NewspaperWorks::NewspapersController.new }

  # The three page documents in the order they were added to the issue.
  let(:ordered_pages_array) { [@page1_solr_doc, @page2_solr_doc, @page3_solr_doc] }

  describe '#pages_for_issue' do
    subject { controller.pages_for_issue(@issue.id) }

    it 'returns the pages, in order' do
      # Comparing the documents directly
      # (expect(subject).to eq ordered_pages_array) reportedly does not work,
      # so compare ids instead. Comparing the whole id list also verifies
      # that exactly three pages come back, not merely that the first three
      # positions match.
      expect(subject.map { |doc| doc['id'] })
        .to eq(ordered_pages_array.map { |doc| doc['id'] })
    end
  end

  describe '#ordered_pages' do
    # Use a fixed permutation that is guaranteed to differ from the expected
    # order; a random shuffle can occasionally return the input unchanged,
    # which would make this example pass without exercising any reordering.
    subject { controller.ordered_pages(ordered_pages_array.reverse) }

    it { is_expected.to eq ordered_pages_array }
  end

  describe '#get_page_index' do
    subject { controller.get_page_index(@page2.id) }

    # @page2 is the second ordered member, hence zero-based index 1.
    it { is_expected.to eq 1 }
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the caching behaviour of NewspaperWorks::ResourceFetcher.
# Requests to the example URLs are stubbed with stub_request.
RSpec.describe NewspaperWorks::ResourceFetcher do
  describe "cache hit and expiration handling" do
    # Plain locals (not `let`), captured by the blocks below: the `before`
    # hook stores the cached timestamps here and the examples read them.
    cached_time_url1 = 0
    cached_time_url2 = 0

    let(:url1) { 'https://www.example.com/1' }
    let(:url2) { 'https://www.example.com/2' }

    before do
      stub_request(:any, url1)
        .to_return(body: 'abc', headers: { 'Content-Length' => 3 })
      stub_request(:any, url2)
        .to_return(body: 'xyz', headers: { 'Content-Length' => 3 })
      # populate cache for url1 by getting:
      record = described_class.new(3600).miss_get(url1)
      cached_time_url1 = record['cached_time']
      # populate cache for url2, but...
      record = described_class.new(3600).miss_get(url2)
      # set cached time to something old (one second past the 3600 limit):
      record['cached_time'] = record['cached_time'] - 3601
      cached_time_url2 = record['cached_time']
    end

    it "gets cached record for url" do
      expect(described_class.include?(url1)).to be true
      record = described_class.get(url1)
      # unchanged timestamp means the record came from the cache
      expect(record['cached_time']).to eq cached_time_url1
    end

    it "refreshes resource from origin on stale cached record" do
      # while it "has" or includes url:
      expect(described_class.include?(url2)).to be true
      # on the terms of the default stale_after parameter, it is too old:
      record = described_class.cache[url2]
      expect(described_class.new(3600).expired(record)).to be true
      # ...fetching will get new:
      record = described_class.get(url2)
      # new time means fresh request made to origin:
      expect(record['cached_time']).not_to eq cached_time_url2
    end
  end

  describe "cache miss fetch handling" do
    let(:url) { 'https://www.example.com' }

    before do
      stub_request(:any, url)
        .to_return(body: 'abc', headers: { 'Content-Length' => 3 })
    end

    it "makes request on cache miss" do
      expect(described_class.include?(url)).to be false
      record = described_class.get(url)
      expect(record).to be_a Hash
      timestamp = record['cached_time']
      # now cached:
      expect(described_class.include?(url)).to be true
      record = described_class.get(url)
      # same timestamp == effect of cache HIT
      expect(record['cached_time']).to eq timestamp
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
# Specs for AltoReader: loading ALTO XML (from a path or from XML text)
# and producing plain-text and word-coordinate JSON output from it.
RSpec.describe NewspaperWorks::TextExtraction::AltoReader do
  let(:fixture_path) { File.join(NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files') }

  # fixture files, and the raw XML of the minimal one:
  let(:minimal_path) { File.join(fixture_path, 'minimal-alto.xml') }
  let(:ndnp_alto_path) { File.join(fixture_path, 'ndnp-alto-sample.xml') }
  let(:minimal) { File.read(minimal_path) }

  # readers constructed from XML text and from file paths:
  let(:reader_minimal) { described_class.new(minimal) }
  let(:reader_minimal_path) { described_class.new(minimal_path) }
  let(:reader_ndnp) { described_class.new(ndnp_alto_path) }

  describe "reads alto" do
    it "loads ALTO source" do
      source_from_path = reader_minimal_path.source
      # path and text construction must yield the same source
      expect(source_from_path).to eq reader_minimal.source
      expect(source_from_path.size).to eq 1383
      expect(reader_ndnp.source.size).to eq 1_050_876
    end

    it "loads document stream" do
      stream = reader_minimal_path.doc_stream
      expect(stream).to be_a_kind_of Nokogiri::XML::SAX::Document
      expect(stream).to respond_to :text
      expect(stream).to respond_to :words
    end
  end

  describe "outputs text derivative formats" do
    it "outputs plain text" do
      # try simple flat text input
      plain = reader_minimal.text
      expect(plain).to eq "This is only a test."
      expect(plain).to eq reader_minimal.doc_stream.text
      # try more complex input
      expect(reader_ndnp.text.size).to eq 30_519
    end

    it "passes args to WordCoordsBuilder and receives output" do
      minimal_coords = JSON.parse(reader_minimal.json)['coords']
      expect(minimal_coords.length).to be > 1
      ndnp_coords = JSON.parse(reader_ndnp.json)['coords']
      expect(ndnp_coords.size).to eq 2_125
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Specs for PageOCR: word/coordinate extraction from sample images, plus
# the ALTO, plain-text and JSON outputs derived from them.
RSpec.describe NewspaperWorks::TextExtraction::PageOCR do
  let(:fixture_path) { File.join(NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files') }

  # ALTO 2.0 schema used to validate generated XML
  let(:altoxsd) do
    Nokogiri::XML::Schema(File.read(File.join(fixture_path, 'alto-2-0.xsd')))
  end

  # sample "snippet" images for OCR testing:
  let(:example_gray_tiff) { File.join(fixture_path, 'ocr_gray.tiff') }
  let(:example_mono_tiff) { File.join(fixture_path, 'ocr_mono.tiff') }
  let(:example_color_tiff) { File.join(fixture_path, 'ocr_color.tiff') }
  let(:example_gray_jp2) { File.join(fixture_path, 'ocr_gray.jp2') }
  let(:ocr_from_gray_tiff) { described_class.new(example_gray_tiff) }

  describe "performs OCR" do
    # Shared checks: a non-empty Array whose first entry is a Hash with the
    # word and its bounding-box keys.
    def match_ocr_expectations(words)
      expect(words).to be_an(Array)
      expect(words).not_to be_empty
      first_word = words[0]
      expect(first_word).to be_a(Hash)
      %i[word x_start y_start x_end y_end].each do |key|
        expect(first_word.keys).to include key
      end
    end

    it "gets words and coordinates from grayscale source" do
      match_ocr_expectations(ocr_from_gray_tiff.words)
    end

    it "gets words and coordinates from one-bit source" do
      match_ocr_expectations(described_class.new(example_mono_tiff).words)
    end

    it "gets words and coordinates from color source" do
      match_ocr_expectations(described_class.new(example_color_tiff).words)
    end

    it "gets words and coordinates from jp2 source" do
      match_ocr_expectations(described_class.new(example_gray_jp2).words)
    end
  end

  describe "turns image into ALTO" do
    it "takes grayscale tiff, outputs valid ALTO, geometry" do
      document = Nokogiri::XML(ocr_from_gray_tiff.alto)
      # validate returns the list of schema violations
      errors = altoxsd.validate(document)
      expect(errors.size).to eq 0
      print_space = document.at_css('PrintSpace')
      expect(print_space['WIDTH']).to eq "418"
      expect(print_space['HEIGHT']).to eq "1046"
    end
  end

  describe "plain text" do
    it "makes plain text available for image" do
      plain = ocr_from_gray_tiff.plain
      expect(plain.class).to be String
      expect(plain.length).to be > 0
    end
  end

  describe "JSON word coordinates" do
    it "passes properly formatted data to WordCoordsBuilder and receives output" do
      coords = JSON.parse(ocr_from_gray_tiff.word_json)['coords']
      expect(coords.length).to be > 1
      word = ocr_from_gray_tiff.words[0]
      # first coordinate set recorded for the first OCR word
      word1_coords = coords[word[:word]][0]
      expected_width = word[:x_end] - word[:x_start]
      expected_height = word[:y_end] - word[:y_start]
      expect(word1_coords[2]).to eq expected_width
      expect(word1_coords[3]).to eq expected_height
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for RenderAlto: rendering a list of word/coordinate hashes, plus
# page width and height, into ALTO XML.
RSpec.describe NewspaperWorks::TextExtraction::RenderAlto do
  let(:fixture_path) do
    File.join(
      NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files'
    )
  end

  # ALTO 2.0 schema used to validate generated XML
  let(:altoxsd) do
    xsdpath = File.join(fixture_path, 'alto-2-0.xsd')
    Nokogiri::XML::Schema(File.read(xsdpath))
  end

  let(:page_prefix) { '<Page ID="ID1" PHYSICAL_IMG_NR="1"' }

  # sample OCR output: one hash per word, with its bounding box
  let(:words) do
    [
      { word: 'If', x_start: 52, y_start: 13, x_end: 63, y_end: 27 },
      { word: 'you', x_start: 69, y_start: 17, x_end: 100, y_end: 31 },
      { word: 'are', x_start: 108, y_start: 17, x_end: 136, y_end: 27 },
      { word: 'a', x_start: 143, y_start: 17, x_end: 151, y_end: 27 },
      { word: 'friend,', x_start: 158, y_start: 13, x_end: 214, y_end: 29 },
      { word: 'you', x_start: 51, y_start: 39, x_end: 82, y_end: 53 },
      { word: 'speak', x_start: 90, y_start: 35, x_end: 140, y_end: 53 },
      { word: 'the', x_start: 146, y_start: 35, x_end: 174, y_end: 49 },
      { word: 'password,', x_start: 182, y_start: 35, x_end: 267, y_end: 53 },
      { word: 'and', x_start: 51, y_start: 57, x_end: 81, y_end: 71 },
      { word: 'the', x_start: 89, y_start: 57, x_end: 117, y_end: 71 },
      { word: 'doors', x_start: 124, y_start: 57, x_end: 172, y_end: 71 },
      { word: 'will', x_start: 180, y_start: 57, x_end: 208, y_end: 71 },
      { word: 'open.', x_start: 216, y_start: 61, x_end: 263, y_end: 75 }
    ]
  end

  describe "renders alto" do
    it "creates alto given width, height, words" do
      renderer = described_class.new(12_000, 9600)
      output = renderer.to_alto(words)
      expect(output.class).to be String
      expect(output).to include '<alto'
      expect(output).to include '<String'
      expect(output).to include page_prefix + ' HEIGHT="9600" WIDTH="12000"'
      # well-formedness check: parser must report no errors
      expect(Nokogiri::XML(output).errors.empty?).to be true
    end

    it "makes alto 2.0 that validates" do
      renderer = described_class.new(12_000, 9600)
      output = renderer.to_alto(words)
      document = Nokogiri::XML(output)
      # Schema#validate returns the list of violations instead of raising,
      # so the result must be asserted for this example to check anything.
      errors = altoxsd.validate(document)
      expect(errors).to be_empty
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for WordCoordsBuilder#to_json, which is given a word list plus
# image width and height and emits a JSON document.
RSpec.describe NewspaperWorks::TextExtraction::WordCoordsBuilder do
  let(:image_width) { 1_234 }
  let(:image_height) { 5_678 }

  # "foo" appears twice on purpose, to exercise merging per word
  let(:words) do
    [
      { word: "foo", coordinates: [1, 2, 3, 4] },
      { word: "bar", coordinates: [5, 6, 7, 8] },
      { word: "baz", coordinates: [9, 10, 11, 12] },
      { word: "foo", coordinates: [13, 14, 15, 16] }
    ]
  end

  let(:wcb) { described_class.new(words, image_width, image_height) }

  describe '#to_json' do
    let(:wcb_to_json) { JSON.parse(wcb.to_json) }

    it 'has the correct structure' do
      expect(wcb_to_json['height']).to eq image_height
      expect(wcb_to_json['width']).to eq image_width
      coords = wcb_to_json['coords']
      # four input words, three distinct keys
      expect(coords.length).to eq 3
      expect(coords['foo']).to be_truthy
    end

    it 'combines coordinates for the same word' do
      foo_coords = wcb_to_json['coords']['foo']
      expect(foo_coords).to eq [[1, 2, 3, 4], [13, 14, 15, 16]]
    end
  end
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'rake'
|
3
|
+
require 'ndnp_shared'
|
4
|
+
require 'lib/newspaper_works/ingest/ingest_shared'
|
5
|
+
require 'active_fedora/cleaner'
|
6
|
+
|
7
|
+
# Specs that invoke the newspaper_works ingest rake tasks and check the
# repository objects they create.
# `batch1` and `pdf_fixtures` are not defined in this file; presumably they
# come from the included shared contexts -- confirm there.
RSpec.describe 'newspaper_works rake tasks' do
  include_context 'ndnp fixture setup'
  include_context 'ingest test fixtures'

  before(:all) do
    Rake.application.rake_require '../lib/tasks/newspaper_works_tasks'
    # the tasks are invoked without a loaded Rails :environment task, so
    # define an empty one
    Rake::Task.define_task(:environment)
  end

  describe 'ingest tasks' do
    before do
      # start every example from an empty repository
      ActiveFedora::Cleaner.clean!
      Hyrax::PermissionTemplateAccess.destroy_all
      Hyrax::PermissionTemplate.destroy_all
    end

    let(:pdf_lccn) { 'sn93059126' }

    let(:pdf_path) { File.join(pdf_fixtures, pdf_lccn, '1853060401.pdf') }

    # Temporary directory named after the LCCN, holding one PDF issue.
    let(:single_issue_dir) do
      # Dir.mktmpdir creates under Dir.tmpdir, which is not always '/tmp';
      # whitelist the real location, and only once.
      tmp_root = Dir.tmpdir
      whitelist = Hyrax.config.whitelisted_ingest_dirs
      whitelist.push(tmp_root) unless whitelist.include?(tmp_root)
      parent_dir = Dir.mktmpdir
      dir = File.join(parent_dir, pdf_lccn)
      FileUtils.mkdir(dir)
      FileUtils.cp(pdf_path, dir)
      dir
    end

    # Runs the named rake task with ARGV stubbed to `<task> -- --path=<path>`.
    def run_ingest_task(task, path)
      stub_const('ARGV', [task, '--', "--path=#{path}"])
      # reenable so the task can run again in a later example
      Rake::Task[task].reenable
      Rake.application.invoke_task(task)
    end

    let(:run_ndnp_ingest_task) do
      run_ingest_task('newspaper_works:ingest_ndnp', batch1)
    end

    let(:run_pdf_ingest_task) do
      run_ingest_task('newspaper_works:ingest_pdf_issues', single_issue_dir)
    end

    # Precondition: no title, issue or page objects exist yet.
    def expect_clean_slate
      expect(NewspaperTitle.all.to_a).to be_empty
      expect(NewspaperIssue.all.to_a).to be_empty
      expect(NewspaperPage.all.to_a).to be_empty
    end

    # Checks that the publication's issues match, by publication date, the
    # issues in the batch XML for the same LCCN.
    # NOTE(review): no example in this file currently calls this helper.
    def expect_generated_issues(publication)
      batch = NewspaperWorks::Ingest::NDNP::BatchXMLIngest.new(batch1)
      relevant = batch.select { |i| i.metadata.lccn == publication.lccn }
      issue_dates = relevant.map(&:publication_date)
      expect(publication.issues.size).to eq issue_dates.size
      expect(publication.issues.map(&:publication_date)).to \
        match_array issue_dates
    end

    # Checks that each LCCN has a title work with at least one issue.
    def expect_generated_content(lccn_list)
      lccn_list.each do |lccn|
        # expect title work for LCCN
        publication = NewspaperTitle.where(lccn: lccn).first
        expect(publication).not_to be_nil
        # expect title to have issue children
        issues = publication.issues.to_a
        expect(issues).not_to be_empty
      end
    end

    # Checks the number of imported pages and that each belongs to a
    # publication with one of the given LCCNs.
    def check_pages(lccns)
      # quick verification of pages imported:
      pages = NewspaperPage.all
      expect(pages.size).to eq 5
      pages.each do |page|
        lccn = page.publication.lccn
        expect(lccns).to include lccn
      end
    end

    # Checks that the PDF ingest created the title and the dated issue.
    def expect_pdf_issues
      # expect NewspaperTitle for the LCCN of the ingested PDF
      publication = NewspaperTitle.where(lccn: pdf_lccn).first
      expect(publication).not_to be_nil
      # expect issue for date:
      issue = publication.issues[0]
      expect(issue.publication_date).to eq '1853-06-04'
    end

    it 'successfully ingests NDNP batch by task' do
      pub_lccns = ['sn84038814', 'sn85025202']
      expect_clean_slate
      run_ndnp_ingest_task
      # the batch we test has two titles, verify all content for each:
      expect_generated_content(pub_lccns)
      check_pages(pub_lccns)
    end

    it 'ingests a PDF issue batch' do
      expect_clean_slate
      run_pdf_ingest_task
      expect_pdf_issues
    end
  end
end
|
data/spec/misc_shared.rb
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
# Shared fixtures and helper methods for specs that need a persisted work,
# a file set, and derivative files on disk.
RSpec.shared_context "shared setup", shared_context: :metadata do
  # Fixture directory; also added (once) to Hyrax's ingest whitelist.
  let(:fixture_path) do
    path = File.join(
      NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files'
    )
    whitelist = Hyrax.config.whitelisted_ingest_dirs
    whitelist.push(path) unless whitelist.include?(path)
    path
  end

  # shared date to be invariant across all tests in a run:
  # (computed once, when this block is evaluated, then exposed via let)
  date_static = Hyrax::TimeService.time_in_utc
  let(:static_date) { date_static }

  # path fixtures:
  let(:example_gray_jp2) { File.join(fixture_path, 'ocr_gray.jp2') }
  let(:txt_path) { File.join(fixture_path, 'credits.md') }
  let(:sample_thumbnail) { File.join(fixture_path, 'thumbnail.jpg') }

  # sample data:
  let(:sample_text) { 'even in a mythical Age there must be some enigmas' }

  # Persisted FileSet with no content; saved with validation skipped.
  let(:valid_file_set) do
    file_set = FileSet.new
    file_set.save!(validate: false)
    file_set
  end

  # Persisted NewspaperPage with valid_file_set as a member.
  let(:sample_work) do
    work = NewspaperPage.new
    work.title = ['Bombadil']
    work.members.push(valid_file_set)
    work.save!
    work
  end

  # sample objects:
  let(:work_with_file) do
    # we need a work with not just a valid (but empty) fileset, but also
    # a persisted file, so we use the shared work sample, and expand
    # on it with actual file data/metadata.
    work = sample_work
    fileset = work_file_set(work)
    file = Hydra::PCDM::File.create
    fileset.original_file = file
    # Set binary content on file via ActiveFedora content= mutator method
    # which also makes .size method return valid result for content
    # NOTE(review): this File handle is never explicitly closed here.
    file.content = File.open(txt_path)
    # Set some metadata we would expect to otherwise be set upon an upload
    file.original_name = 'credits.md'
    file.mime_type = 'text/plain'
    file.date_modified = static_date
    file.date_created = static_date
    # saving fileset also saves file content
    fileset.save!
    work
  end

  # Class used to compute derivative paths.
  def path_factory
    Hyrax::DerivativePath
  end

  # First member of the work that is a FileSet (nil when there is none).
  def work_file_set(work)
    work.members.select { |m| m.is_a?(FileSet) }.first
  end

  # Derivative path of the work's file set for the 'txt' destination.
  def text_path(work)
    path_factory.derivative_path_for_reference(work_file_set(work), 'txt')
  end

  # Derivative path of the work's file set for the 'jp2' destination.
  def jp2_path(work)
    path_factory.derivative_path_for_reference(work_file_set(work), 'jp2')
  end

  # Derivative path of the work's file set for the 'thumbnail' destination.
  def thumbnail_path(work)
    path_factory.derivative_path_for_reference(work_file_set(work), 'thumbnail')
  end

  # Creates the directory that will contain the named derivative.
  def mkdir_derivative(work, name)
    # make shared path for derivatives to live, Hyrax usually does this
    # for thumbnails, and newspaper_works does this in its derivative
    # service plugins; here we do same.
    fsid = work_file_set(work).id
    path = path_factory.derivative_path_for_reference(fsid, name)
    # mkdir_p is a no-op when the directory already exists
    FileUtils.mkdir_p(File.dirname(path))
  end

  # Copies the sample JP2 into place as the work's jp2 derivative.
  def mk_jp2_derivative(work)
    mkdir_derivative(work, 'jp2')
    dst_path = jp2_path(work)
    FileUtils.copy(example_gray_jp2, dst_path)
    expect(File.exist?(dst_path)).to be true
  end

  # Writes sample_text as the work's txt derivative.
  def mk_txt_derivative(work)
    mkdir_derivative(work, 'txt')
    dst_path = text_path(work)
    File.open(dst_path, 'w') { |f| f.write(sample_text) }
    expect(File.exist?(dst_path)).to be true
  end

  # Copies the sample thumbnail into place as the work's thumbnail derivative.
  def mk_thumbnail_derivative(work)
    mkdir_derivative(work, 'thumbnail')
    dst_path = thumbnail_path(work)
    FileUtils.copy(sample_thumbnail, dst_path)
    expect(File.exist?(dst_path)).to be true
  end
end
|