newspaper_works 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.fcrepo_wrapper +4 -0
- data/.gitignore +43 -0
- data/.rubocop.yml +143 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +50 -0
- data/Gemfile +47 -0
- data/LICENSE +203 -0
- data/README.md +159 -0
- data/Rakefile +38 -0
- data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
- data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
- data/app/assets/config/newspaper_works_manifest.js +2 -0
- data/app/assets/images/newspaper_works/.keep +0 -0
- data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
- data/app/assets/javascripts/newspaper_works.js +4 -0
- data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
- data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
- data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
- data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
- data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
- data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
- data/app/forms/hyrax/newspaper_article_form.rb +11 -0
- data/app/forms/hyrax/newspaper_container_form.rb +11 -0
- data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
- data/app/forms/hyrax/newspaper_page_form.rb +15 -0
- data/app/forms/hyrax/newspaper_title_form.rb +12 -0
- data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
- data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
- data/app/helpers/newspaper_works/application_helper.rb +5 -0
- data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
- data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
- data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
- data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
- data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
- data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
- data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
- data/app/indexers/newspaper_article_indexer.rb +16 -0
- data/app/indexers/newspaper_container_indexer.rb +18 -0
- data/app/indexers/newspaper_issue_indexer.rb +26 -0
- data/app/indexers/newspaper_page_indexer.rb +9 -0
- data/app/indexers/newspaper_title_indexer.rb +19 -0
- data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
- data/app/jobs/newspaper_works/application_job.rb +4 -0
- data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
- data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
- data/app/mailers/newspaper_works/application_mailer.rb +8 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
- data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
- data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
- data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
- data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
- data/app/models/file_set.rb +10 -0
- data/app/models/newspaper_article.rb +158 -0
- data/app/models/newspaper_container.rb +86 -0
- data/app/models/newspaper_issue.rb +115 -0
- data/app/models/newspaper_page.rb +70 -0
- data/app/models/newspaper_title.rb +111 -0
- data/app/models/newspaper_works/application_record.rb +6 -0
- data/app/models/newspaper_works/derivative_attachment.rb +8 -0
- data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
- data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
- data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
- data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
- data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
- data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
- data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
- data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
- data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
- data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
- data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
- data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
- data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
- data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
- data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
- data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
- data/app/services/hyrax/article_genre_service.rb +9 -0
- data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
- data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
- data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
- data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
- data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
- data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
- data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
- data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
- data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
- data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
- data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
- data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
- data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
- data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
- data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
- data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
- data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
- data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
- data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
- data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
- data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
- data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
- data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
- data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
- data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
- data/app/views/newspaper_works/base/_show.html.erb +45 -0
- data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
- data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
- data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
- data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
- data/app/views/records/edit_fields/_genre.html.erb +4 -0
- data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
- data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/newspaper_article.de.yml +12 -0
- data/config/locales/newspaper_article.en.yml +12 -0
- data/config/locales/newspaper_article.es.yml +12 -0
- data/config/locales/newspaper_article.fr.yml +12 -0
- data/config/locales/newspaper_article.it.yml +12 -0
- data/config/locales/newspaper_article.pt-BR.yml +12 -0
- data/config/locales/newspaper_article.zh.yml +12 -0
- data/config/locales/newspaper_container.de.yml +8 -0
- data/config/locales/newspaper_container.en.yml +8 -0
- data/config/locales/newspaper_container.es.yml +8 -0
- data/config/locales/newspaper_container.fr.yml +8 -0
- data/config/locales/newspaper_container.it.yml +8 -0
- data/config/locales/newspaper_container.pt-BR.yml +8 -0
- data/config/locales/newspaper_container.zh.yml +8 -0
- data/config/locales/newspaper_issue.de.yml +8 -0
- data/config/locales/newspaper_issue.en.yml +8 -0
- data/config/locales/newspaper_issue.es.yml +8 -0
- data/config/locales/newspaper_issue.fr.yml +8 -0
- data/config/locales/newspaper_issue.it.yml +8 -0
- data/config/locales/newspaper_issue.pt-BR.yml +8 -0
- data/config/locales/newspaper_issue.zh.yml +8 -0
- data/config/locales/newspaper_page.de.yml +15 -0
- data/config/locales/newspaper_page.en.yml +15 -0
- data/config/locales/newspaper_page.es.yml +15 -0
- data/config/locales/newspaper_page.fr.yml +15 -0
- data/config/locales/newspaper_page.it.yml +15 -0
- data/config/locales/newspaper_page.pt-BR.yml +15 -0
- data/config/locales/newspaper_page.zh.yml +15 -0
- data/config/locales/newspaper_title.de.yml +8 -0
- data/config/locales/newspaper_title.en.yml +8 -0
- data/config/locales/newspaper_title.es.yml +8 -0
- data/config/locales/newspaper_title.fr.yml +8 -0
- data/config/locales/newspaper_title.it.yml +8 -0
- data/config/locales/newspaper_title.pt-BR.yml +8 -0
- data/config/locales/newspaper_title.zh.yml +8 -0
- data/config/locales/newspaper_works.de.yml +50 -0
- data/config/locales/newspaper_works.en.yml +52 -0
- data/config/locales/newspaper_works.es.yml +52 -0
- data/config/locales/newspaper_works.fr.yml +52 -0
- data/config/locales/newspaper_works.it.yml +52 -0
- data/config/locales/newspaper_works.pt-BR.yml +52 -0
- data/config/locales/newspaper_works.zh.yml +52 -0
- data/config/routes.rb +9 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
- data/lib/generators/newspaper_works/assets_generator.rb +29 -0
- data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
- data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
- data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
- data/lib/generators/newspaper_works/install_generator.rb +97 -0
- data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
- data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
- data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
- data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
- data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
- data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
- data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
- data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
- data/lib/newspaper_works/configuration.rb +14 -0
- data/lib/newspaper_works/data/fileset_helper.rb +25 -0
- data/lib/newspaper_works/data/path_helper.rb +40 -0
- data/lib/newspaper_works/data/work_derivatives.rb +314 -0
- data/lib/newspaper_works/data/work_file.rb +92 -0
- data/lib/newspaper_works/data/work_files.rb +181 -0
- data/lib/newspaper_works/data.rb +35 -0
- data/lib/newspaper_works/engine.rb +42 -0
- data/lib/newspaper_works/errors.rb +14 -0
- data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
- data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
- data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
- data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
- data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
- data/lib/newspaper_works/ingest/from_command.rb +52 -0
- data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
- data/lib/newspaper_works/ingest/issue_images.rb +51 -0
- data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
- data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
- data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
- data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
- data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
- data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
- data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
- data/lib/newspaper_works/ingest/ndnp.rb +21 -0
- data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
- data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
- data/lib/newspaper_works/ingest/page_image.rb +52 -0
- data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
- data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
- data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
- data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
- data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
- data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
- data/lib/newspaper_works/ingest/publication_info.rb +44 -0
- data/lib/newspaper_works/ingest.rb +90 -0
- data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
- data/lib/newspaper_works/logging.rb +54 -0
- data/lib/newspaper_works/page_finder.rb +62 -0
- data/lib/newspaper_works/resource_fetcher.rb +78 -0
- data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
- data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
- data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
- data/lib/newspaper_works/text_extraction.rb +10 -0
- data/lib/newspaper_works/version.rb +3 -0
- data/lib/newspaper_works.rb +19 -0
- data/lib/tasks/newspaper_works_tasks.rake +39 -0
- data/newspaper_works.gemspec +49 -0
- data/spec/.keep.txt +1 -0
- data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
- data/spec/controllers/catalog_controller_spec.rb +63 -0
- data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
- data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_issue_ingest.rb +6 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_ingest.rb +6 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/features/front_pages_for_title_spec.rb +19 -0
- data/spec/features/newspaper_title_search_spec.rb +30 -0
- data/spec/features/newspapers_search_spec.rb +49 -0
- data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
- data/spec/features_shared.rb +71 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
- data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
- data/spec/fixtures/files/resource_mocks/urls.json +82 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
- data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
- data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
- data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
- data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
- data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
- data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
- data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
- data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
- data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
- data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
- data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
- data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
- data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
- data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
- data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
- data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
- data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
- data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
- data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
- data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
- data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
- data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
- data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
- data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
- data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
- data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
- data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
- data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
- data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
- data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
- data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
- data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
- data/spec/lib/newspaper_works/logging_spec.rb +53 -0
- data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
- data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
- data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
- data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
- data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
- data/spec/misc_shared.rb +109 -0
- data/spec/model_shared.rb +134 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
- data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
- data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
- data/spec/models/newspaper_article_spec.rb +73 -0
- data/spec/models/newspaper_container_spec.rb +111 -0
- data/spec/models/newspaper_issue_spec.rb +91 -0
- data/spec/models/newspaper_page_spec.rb +44 -0
- data/spec/models/newspaper_title_spec.rb +116 -0
- data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
- data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/ndnp_shared.rb +48 -0
- data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
- data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
- data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
- data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
- data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
- data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
- data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
- data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
- data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
- data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
- data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
- data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
- data/spec/routing/route_spec.rb +52 -0
- data/spec/search_builders/custom_search_builder_spec.rb +34 -0
- data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
- data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
- data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
- data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
- data/spec/spec_helper.rb +261 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
- data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
- data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
- data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
- data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
- data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
- data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
- data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
- data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
- data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
- data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
- data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
- data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
- data/tasks/newspaperworks_dev.rake +26 -0
- data/test/integration/navigation_test.rb +7 -0
- data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
- data/test/newspaper_works_test.rb +7 -0
- data/test/test_helper.rb +17 -0
- data/tmp/.keep +0 -0
- metadata +1037 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
module NewspaperWorks
|
|
2
|
+
module Ingest
|
|
3
|
+
module NDNP
|
|
4
|
+
class IssueIngester
|
|
5
|
+
include NewspaperWorks::Logging
|
|
6
|
+
include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper
|
|
7
|
+
include NewspaperWorks::Ingest::PubFinder
|
|
8
|
+
|
|
9
|
+
attr_accessor :issue, :target, :opts
|
|
10
|
+
|
|
11
|
+
delegate :path, to: :issue
|
|
12
|
+
|
|
13
|
+
COPY_FIELDS = [
|
|
14
|
+
:lccn,
|
|
15
|
+
:edition_number,
|
|
16
|
+
:edition_name,
|
|
17
|
+
:volume,
|
|
18
|
+
:publication_date,
|
|
19
|
+
:held_by,
|
|
20
|
+
:issue_number
|
|
21
|
+
].freeze
|
|
22
|
+
|
|
23
|
+
# @param issue [NewspaperWorks::Ingest::NDNP::IssueIngest]
|
|
24
|
+
# source issue data
|
|
25
|
+
# @param opts [Hash]
|
|
26
|
+
# ingest options, e.g. administrative metadata
|
|
27
|
+
def initialize(issue, opts = {})
|
|
28
|
+
@issue = issue
|
|
29
|
+
@opts = opts
|
|
30
|
+
@target = nil
|
|
31
|
+
configure_logger('ingest')
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def ingest
|
|
35
|
+
construct_issue
|
|
36
|
+
ingest_pages
|
|
37
|
+
NewspaperWorks::ComposeIssuePDFJob.perform_later(@target)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def construct_issue
|
|
41
|
+
create_issue
|
|
42
|
+
find_or_create_linked_publication
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def ingest_pages
|
|
46
|
+
issue.each do |page|
|
|
47
|
+
page_ingester(page).ingest
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
private
|
|
52
|
+
|
|
53
|
+
def page_ingester(page_data)
|
|
54
|
+
NewspaperWorks::Ingest::NDNP::PageIngester.new(
|
|
55
|
+
page_data,
|
|
56
|
+
@target,
|
|
57
|
+
@opts
|
|
58
|
+
)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def publication_date
|
|
62
|
+
parsed = DateTime.iso8601(issue.metadata.publication_date)
|
|
63
|
+
parsed.strftime('%B %-d, %Y')
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def publication_title(issue)
|
|
67
|
+
issue.metadata.publication_title.strip.split(/ \(/)[0]
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def issue_title
|
|
71
|
+
"#{publication_title(issue)}: #{publication_date}"
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Copy descriptive metadata from the source issue onto @target.
def copy_issue_metadata
  source = issue.metadata
  # title is plural on the target, so wrap the single composed title:
  @target.title = [issue_title]
  # every COPY_FIELDS name is read from the source and written to the
  # same-named setter on the target, value unchanged:
  COPY_FIELDS.each do |field|
    value = source.send(field.to_s)
    @target.send("#{field}=", value)
  end
end
|
|
84
|
+
|
|
85
|
+
# Create the NewspaperIssue work for this ingest and store it in @target:
# copy descriptive metadata, assign administrative metadata, save, and
# log the id of the saved work.
# NOTE(review): NewspaperIssue.create is followed by save! after the
# fields are set; presumably NewspaperIssue.new would avoid an extra
# write -- confirm before changing.
def create_issue
  @target = NewspaperIssue.create
  copy_issue_metadata
  assign_administrative_metadata
  @target.save!
  write_log("Saved metadata to new NewspaperIssue #{@target.id}")
end
|
|
92
|
+
|
|
93
|
+
# Find or create the publication for the issue work, passing the
# source issue's publication title and LCCN plus the ingest options.
def find_or_create_linked_publication
  title = publication_title(issue)
  find_or_create_publication_for_issue(
    @target,
    issue.metadata.lccn,
    title,
    @opts
  )
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
module NewspaperWorks
|
|
2
|
+
module Ingest
|
|
3
|
+
module NDNP
|
|
4
|
+
class IssueMetadata
|
|
5
|
+
include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper
|
|
6
|
+
|
|
7
|
+
attr_accessor :path, :doc, :parent
|
|
8
|
+
|
|
9
|
+
# @param path [String] path to the issue XML file
# @param parent [Object, NilClass] optional owner object; when given,
#   load_doc takes the document from parent.doc
def initialize(path, parent = nil)
  @doc = nil
  @parent = parent
  @path = path
  load_doc
end
|
|
15
|
+
|
|
16
|
+
# Short debugging representation naming the class, an object id and the
# source path.
#
# The path is supplied as a format argument rather than interpolated into
# the template, so a path containing "%" cannot corrupt the format
# string; the representation is also closed with " ...>" like the other
# NDNP inspect methods.
# @return [String]
def inspect
  format(
    "<%<klass>s:0x000000000%<oid>x\n" \
    "\tpath: '%<path>s' ...>",
    klass: self.class,
    oid: object_id << 1,
    path: path
  )
end
|
|
23
|
+
|
|
24
|
+
# LCCN (mandatory)
|
|
25
|
+
# @return [String]
|
|
26
|
+
def lccn
  identifiers = xpath("//mods:identifier[@type='lccn']")
  identifiers.text
end
|
|
29
|
+
|
|
30
|
+
# Volume number (optional)
|
|
31
|
+
# @return [String,NilClass]
|
|
32
|
+
def volume
  numbers = xpath("//mods:detail[@type='volume']/mods:number")
  # nil, not an empty string, when the issue carries no volume detail
  numbers.size.zero? ? nil : numbers.text
end
|
|
37
|
+
|
|
38
|
+
# Issue number (optional)
|
|
39
|
+
# @return [String,NilClass]
|
|
40
|
+
def issue_number
  numbers = xpath("//mods:detail[@type='issue']/mods:number")
  # nil, not an empty string, when the issue carries no issue detail
  numbers.size.zero? ? nil : numbers.text
end
|
|
45
|
+
|
|
46
|
+
# Edition name
|
|
47
|
+
# Edition name ("caption" / "label") is optional
|
|
48
|
+
# in NDNP, but it may be used as a label for readability.
|
|
49
|
+
# @return [String,NilClass]
|
|
50
|
+
def edition_name
  captions = xpath("//mods:detail[@type='edition']/mods:caption")
  return if captions.size.zero?
  captions.text
end
|
|
54
|
+
|
|
55
|
+
# Edition number (mandatory)
|
|
56
|
+
# @return [String]
|
|
57
|
+
def edition_number
  numbers = xpath("//mods:detail[@type='edition']/mods:number")
  numbers.text
end
|
|
60
|
+
|
|
61
|
+
# Issue date (mandatory field) as ISO 8601 datestamp string
|
|
62
|
+
# @return [String] (ISO-8601 date) publication date
|
|
63
|
+
def publication_date
  issued = xpath("//mods:originInfo/mods:dateIssued")
  issued.text
end
|
|
66
|
+
|
|
67
|
+
# Title of the publication this issue belongs to.
#
# The title recorded in the reel (container) metadata is preferred; when
# there is no parent or no reel, the title is parsed from //mets/@LABEL.
# @return [String, NilClass] nil when neither source yields a title
def publication_title
  # try from reel first
  reel = parent.nil? ? nil : parent.container
  return reel.metadata.title unless reel.nil?
  # fallback to parsing //mets/@LABEL, based on label convention:
  #   "ACME Times (Springfield, UT), 1911-01-25, First Edition"
  label = xpath('//mets:mets/@LABEL').first
  return if label.nil?
  # Keep everything before the first ", <digit>" (the date part): this is
  # the name and (*for now TBD*) place of publication as a string in
  # parentheses.  An empty label splits to [], so the result is nil
  # rather than an error.
  label.value.to_s.split(/, [0-9]/).first
end
|
|
80
|
+
|
|
81
|
+
# Original Source Repository (NDNP-mandatory)
|
|
82
|
+
# @return [String]
|
|
83
|
+
# Reads the displayLabel attribute of the first mods:physicalLocation
# node.  Returns nil, instead of raising NoMethodError on nil, when the
# document has no such node.
# @return [String, NilClass]
def held_by
  location = xpath("//mods:physicalLocation").first
  location.nil? ? nil : location['displayLabel']
end
|
|
86
|
+
|
|
87
|
+
private
|
|
88
|
+
|
|
89
|
+
# Populate @doc: reuse the parent's document when a parent is set,
# otherwise parse the file at +path+.
def load_doc
  @doc = @parent.doc unless @parent.nil?
  return unless @doc.nil?
  # block form closes the file handle as soon as parsing is done
  @doc = File.open(path) { |file| Nokogiri::XML(file) }
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module NewspaperWorks
|
|
2
|
+
module Ingest
|
|
3
|
+
module NDNP
|
|
4
|
+
# Mixin for assigning administrative metadata to ingested work assets
|
|
5
|
+
module NDNPAssetHelper
|
|
6
|
+
# Set administrative metadata for asset, based on options saved
|
|
7
|
+
# on ingester state.
|
|
8
|
+
# Pre-conditions for use:
|
|
9
|
+
# consuming class implements @target pointing to work asset
|
|
10
|
+
# consuming class implements @opts pointing to Hash
|
|
11
|
+
def assign_administrative_metadata(work = nil)
  # an explicitly passed work wins; otherwise use the ingester's @target
  asset = work || @target
  NewspaperWorks::Ingest.assign_administrative_metadata(asset, @opts)
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
|
|
4
|
+
module Ingest
|
|
5
|
+
module NDNP
|
|
6
|
+
# Mixin for mets-specific XPath and traversal of issue/page data
|
|
7
|
+
module NDNPMetsHelper
|
|
8
|
+
# Namespace prefix => namespace URI map passed along with XPath queries.
# Each of the METS, MODS and NDNP namespaces is registered under both a
# lower-case and an upper-case prefix.
XML_NS = {
  mets: 'http://www.loc.gov/METS/',
  METS: 'http://www.loc.gov/METS/',
  mods: 'http://www.loc.gov/mods/v3',
  MODS: 'http://www.loc.gov/mods/v3',
  ndnp: 'http://www.loc.gov/ndnp',
  NDNP: 'http://www.loc.gov/ndnp'
}.freeze
|
|
16
|
+
|
|
17
|
+
# DRY XPath without repeatedly specifying default namespace urlmap
|
|
18
|
+
# @param expr [String] XPath expression
# @param context [Object, NilClass] object to query; defaults to +doc+
def xpath(expr, context = nil)
  (context || doc).xpath(expr, **XML_NS)
end
|
|
25
|
+
|
|
26
|
+
# Query for the mets:dmdSec whose ID equals this object's dmdid.
def dmd_node
  selector = "//mets:dmdSec[@ID='#{dmdid}']"
  xpath(selector)
end
|
|
29
|
+
|
|
30
|
+
# Resolve a file path named in the XML: paths starting with "/" are
# returned unchanged, anything else is joined onto the directory that
# contains +path+.
# @param specified_path [String]
# @return [String]
def normalize_path(specified_path)
  if specified_path.start_with?('/')
    specified_path
  else
    File.join(File.dirname(path), specified_path)
  end
end
|
|
35
|
+
|
|
36
|
+
# returns hash of "use" key string to path value
|
|
37
|
+
def page_files
  # structMap div for this page, located through its DMDID:
  page_div = xpath("//mets:structMap//mets:div[@DMDID='#{dmdid}']")
  xpath('mets:fptr', page_div).each_with_object({}) do |pointer, files|
    # each pointer names a file entry in the fileSec by ID
    entry = xpath(
      "//mets:fileSec//mets:fileGrp//mets:file[@ID='#{pointer['FILEID']}']"
    ).first
    use = entry['USE']
    locator = xpath('mets:FLocat', entry).first
    href = locator.attribute_with_ns('href', 'http://www.w3.org/1999/xlink')
    files[use] = href.to_s
  end
end
|
|
55
|
+
|
|
56
|
+
# Path at which the reel (container) XML is looked for: the directory
# one level above the directory holding +path+, with a file named after
# that directory plus "_1.xml".
# @return [String]
def container_path
  reel_dir = File.expand_path('..', File.dirname(path))
  file_name = "#{File.basename(reel_dir)}_1.xml"
  File.join(reel_dir, file_name)
end
|
|
61
|
+
|
|
62
|
+
# Container (reel) ingest object for the reel XML at container_path.
# @return [NewspaperWorks::Ingest::NDNP::ContainerIngest, NilClass]
#   nil when no file exists at that path
def container
  reel_path = container_path
  if File.exist?(reel_path)
    NewspaperWorks::Ingest::NDNP::ContainerIngest.new(reel_path)
  end
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
module NewspaperWorks
|
|
2
|
+
module Ingest
|
|
3
|
+
module NDNP
|
|
4
|
+
class PageIngest
|
|
5
|
+
include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper
|
|
6
|
+
|
|
7
|
+
attr_accessor :path, :dmdid, :doc, :files
|
|
8
|
+
|
|
9
|
+
# @param path [String] path to the XML file; required
# @param dmdid [String, NilClass] DMDID identifying the page
# @param parent [Object, NilClass] optional owner whose doc is reused
# @raise [ArgumentError] when no path is given
def initialize(path = nil, dmdid = nil, parent = nil)
  raise ArgumentError, 'No path provided' if path.nil?
  @path = path
  @dmdid = dmdid
  @parent = parent
  @metadata = nil
  @doc = nil
  load_doc
  # resolve every file path listed for this page against +path+
  listed_paths = page_files.values
  @files = listed_paths.map { |listed| normalize_path(listed) }
end
|
|
19
|
+
|
|
20
|
+
# Short debugging representation naming the class, an object id, the
# source path and the page DMDID.
#
# Path and DMDID are supplied as format arguments rather than
# interpolated into the template, so a "%" in either value cannot
# corrupt the format string.
# @return [String]
def inspect
  format(
    "<%<klass>s:0x000000000%<oid>x\n" \
    "\tpath: '%<path>s',\n" \
    "\tdmdid: '%<dmdid>s' ...>",
    klass: self.class,
    oid: object_id << 1,
    path: path,
    dmdid: dmdid
  )
end
|
|
28
|
+
|
|
29
|
+
# Page metadata object for this page, built on first use and then
# reused on later calls.
# @return [NewspaperWorks::Ingest::NDNP::PageMetadata]
def metadata
  if @metadata.nil?
    metadata_class = NewspaperWorks::Ingest::NDNP::PageMetadata
    @metadata = metadata_class.new(path, self, dmdid)
  end
  @metadata
end
|
|
37
|
+
|
|
38
|
+
private
|
|
39
|
+
|
|
40
|
+
# Populate @doc: reuse the parent's document when a parent is set,
# otherwise parse the file at +path+.
def load_doc
  @doc = @parent.doc unless @parent.nil?
  return unless @doc.nil?
  # block form closes the file handle as soon as parsing is done
  @doc = File.open(path) { |file| Nokogiri::XML(file) }
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
require 'newspaper_works/logging'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
|
|
4
|
+
module Ingest
|
|
5
|
+
module NDNP
|
|
6
|
+
class PageIngester
|
|
7
|
+
include NewspaperWorks::Logging
|
|
8
|
+
include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper
|
|
9
|
+
|
|
10
|
+
attr_accessor :page, :issue, :target, :opts
|
|
11
|
+
|
|
12
|
+
delegate :path, :dmdid, to: :page
|
|
13
|
+
|
|
14
|
+
# Page metadata fields copied from the source page onto the new work.
COPY_FIELDS = %i[width height page_number identifier].freeze
|
|
20
|
+
|
|
21
|
+
# Copied fields whose single source value is wrapped in an Array before
# being assigned to the target work.
COPY_FIELDS_PLURALIZE = %i[identifier].freeze
|
|
24
|
+
|
|
25
|
+
# @param page [NewspaperWorks::Ingest::NDNP::PageIngest]
|
|
26
|
+
# source page data
|
|
27
|
+
# @param issue [NewspaperIssue]
|
|
28
|
+
# source issue data
|
|
29
|
+
# @param opts [Hash]
|
|
30
|
+
# ingest options, e.g. administrative metadata
|
|
31
|
+
def initialize(page, issue, opts = {})
  # inputs: source page data, owning issue work, ingest options
  @opts = opts
  @issue = issue
  @page = page
  # target is to-be-created NewspaperPage; the WorkFiles helper is
  # created later, in ingest_page_files:
  @work_files = nil
  @target = nil
  configure_logger('ingest')
end
|
|
40
|
+
|
|
41
|
+
# Run the whole page ingest: create the page work (construct_page),
# attach its files (ingest_page_files), then link it to its reel
# container, if any (link_reel).
def ingest
  construct_page
  ingest_page_files
  link_reel
end
|
|
46
|
+
|
|
47
|
+
# Create the NewspaperPage work for this page and store it in @target.
# The work is created with its title only; descriptive and
# administrative metadata are then set, the page is appended to the
# issue (link_issue saves the issue), and the page itself is saved.
def construct_page
  @target = NewspaperPage.create!(title: page_title)
  write_log(
    "Created NewspaperPage work #{@target.id} "\
    "with title '#{@target.title[0]}'"
  )
  copy_page_metadata
  assign_administrative_metadata
  link_issue
  @target.save!
  write_log("Saved metadata to NewspaperPage work #{@target.id}")
end
|
|
59
|
+
|
|
60
|
+
# Ingest primary, derivative files; other derivatives including
|
|
61
|
+
# thumbnail, plain-text, json will be made by NewspaperWorks
|
|
62
|
+
# derivative service components as a consequence of committing
|
|
63
|
+
# files assigned (via actor stack, via WorkFiles).
|
|
64
|
+
def ingest_page_files
  @work_files = NewspaperWorks::Data::WorkFiles.new(@target)
  page.files.each do |file_path|
    # text after the last "." of the lower-cased path decides the role
    extension = file_path.downcase.split('.').last
    if %w[tif tiff].include?(extension)
      ingest_primary_file(file_path)
    else
      ingest_derivative_file(file_path)
    end
  end
  message = "Beginning file attachment process (WorkFiles.commit!) "\
            "for work #{@target.id}"
  write_log(message)
  @work_files.commit!
end
|
|
78
|
+
|
|
79
|
+
# Link the page work to the container (reel) it came from; does nothing
# when the source page reports no container.
def link_reel
  reel_data = @page.container
  return if reel_data.nil?
  ingester_class = NewspaperWorks::Ingest::NDNP::ContainerIngester
  ingester = ingester_class.new(reel_data, issue.publication, @opts)
  # find-or-create container, linked to publication:
  ingester.ingest
  # link target page to container asset for reel:
  ingester.link(@target)
end
|
|
92
|
+
|
|
93
|
+
private
|
|
94
|
+
|
|
95
|
+
# Assign the primary (TIFF) file to the page work.  When the TIFF is
# not on disk, one is generated from the page's PDF instead.
# @param path [String] path to the TIFF named in the page's file list
# @raise [ArgumentError] when the TIFF is missing and the page lists no
#   PDF to generate it from
def ingest_primary_file(path)
  unless File.exist?(path)
    # match the extension case-insensitively, as ingest_page_files does
    # when it recognizes TIFF files
    pdf_path = page.files.find { |p| p.downcase.end_with?('pdf') }
    if pdf_path.nil?
      raise ArgumentError,
            "Primary file #{path} is missing and no PDF is available"
    end
    # make and get TIFF path (to generated tmp file):
    path = make_tiff(pdf_path)
  end
  write_log("Assigned primary file to work #{@target.id}, #{path}")
  @work_files.assign(path)
end
|
|
104
|
+
|
|
105
|
+
# Log and assign one derivative file to the page work's derivatives.
# @param path [String] path to the derivative file
def ingest_derivative_file(path)
  message = "Assigned derivative file to work #{@target.id}, #{path}"
  write_log(message)
  @work_files.derivatives.assign(path)
end
|
|
109
|
+
|
|
110
|
+
# Append the page work to the issue's ordered members, save the issue,
# and log the link.
def link_issue
  issue.ordered_members << @target # page
  issue.save!
  message = "Linked NewspaperIssue work #{issue.id} "\
            "to NewspaperPage work #{@target.id}"
  write_log(message)
end
|
|
118
|
+
|
|
119
|
+
# dir whitelist
|
|
120
|
+
def whitelist
  hyrax_config = Hyrax.config
  hyrax_config.whitelisted_ingest_dirs
end
|
|
123
|
+
|
|
124
|
+
# Generate TIFF in temporary file, return its path, given path to PDF
|
|
125
|
+
# @param pdf_path [String] path to single-page PDF
|
|
126
|
+
# @return [String] path to generated TIFF
|
|
127
|
+
def make_tiff(pdf_path)
  warning = "Creating TIFF from PDF in lieu of missing for work "\
            " (#{@target.id})"
  write_log(warning, Logger::WARN)
  # add the temporary directory to the ingest whitelist if absent
  tmp_dir = Dir.tmpdir
  whitelist.push(tmp_dir) unless whitelist.include?(tmp_dir)
  # first entry of the PdfPages listing is returned as the TIFF path
  NewspaperWorks::Ingest::PdfPages.new(pdf_path).to_a.first
end
|
|
136
|
+
|
|
137
|
+
# Page title as issue title plus page title
|
|
138
|
+
# e.g. "ACME Tribune (1910-01-02): Page 2"
|
|
139
|
+
# @return [Array<String>] one-element list holding the composed page title
|
|
140
|
+
def page_title
  issue_title = issue.title.first
  number = @page.metadata.page_number
  # wrapped in an Array: the value is passed as the work's title
  ["#{issue_title}: Page #{number}"]
end
|
|
143
|
+
|
|
144
|
+
# Copy descriptive metadata from the source page onto the target
# NewspaperPage work.
def copy_page_metadata
  source = page.metadata
  COPY_FIELDS.each do |field|
    raw = source.send(field.to_s)
    # fields listed in COPY_FIELDS_PLURALIZE are stored as one-element
    # Arrays; all others are assigned as-is
    value = COPY_FIELDS_PLURALIZE.include?(field) ? [raw] : raw
    @target.send("#{field}=", value)
  end
end
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
end
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
|
|
4
|
+
module Ingest
|
|
5
|
+
module NDNP
|
|
6
|
+
class PageMetadata
|
|
7
|
+
# mixin convenience methods for NDNP XML, plus XML_NS hash
|
|
8
|
+
include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper
|
|
9
|
+
|
|
10
|
+
attr_accessor :path, :dmdid, :doc
|
|
11
|
+
|
|
12
|
+
# @param path [String, NilClass] path to the XML file
# @param parent [Object, NilClass] optional owner whose doc is reused
# @param dmdid [String, NilClass] DMDID identifying the page
# @raise [ArgumentError] when neither path nor parent is given
def initialize(path = nil, parent = nil, dmdid = nil)
  no_context = path.nil? && parent.nil?
  raise ArgumentError, 'No context provided' if no_context
  @doc = nil
  @dmdid = dmdid
  @parent = parent
  @path = path
  load_doc
end
|
|
20
|
+
|
|
21
|
+
# Short debugging representation naming the class, an object id, the
# source path and the page DMDID.
#
# Path and DMDID are supplied as format arguments rather than
# interpolated into the template, so a "%" in either value cannot
# corrupt the format string.
# @return [String]
def inspect
  format(
    "<%<klass>s:0x000000000%<oid>x\n" \
    "\tpath: '%<path>s',\n" \
    "\tdmdid: '%<dmdid>s' ...>",
    klass: self.class,
    oid: object_id << 1,
    path: path,
    dmdid: dmdid
  )
end
|
|
29
|
+
|
|
30
|
+
# Printed page number, if printed; optional field in NDNP spec.
|
|
31
|
+
# "Number" is used liberally, and may contain both alpha
|
|
32
|
+
# and numeric characters. As such, return value is String.
|
|
33
|
+
#
|
|
34
|
+
# If NDNP issue data fails to provide an explicitly
|
|
35
|
+
# human-readable page number, fallback to sequence
|
|
36
|
+
# number, in String form.
|
|
37
|
+
#
|
|
38
|
+
# @return [String, NilClass] Page "number" string
|
|
39
|
+
def page_number
  detail = dmd_node.xpath(
    ".//mods:mods//mods:detail[@type='page number']",
    **XML_NS
  )
  # a "page number" detail may exist without a mods:number child; treat
  # that like a missing detail instead of raising on nil
  number = detail.size.zero? ? nil : detail.xpath("mods:number", **XML_NS).first
  return number.text unless number.nil?
  # no printed page number: fall back to the sequence number as String
  fallback = page_sequence_number
  fallback.nil? ? nil : fallback.to_s
end
|
|
50
|
+
|
|
51
|
+
# Page sequence number, indexical to order in issue.
|
|
52
|
+
# "Number" here is one-indexed positive integer, position in
|
|
53
|
+
# issue. Mandatory for page of issue, nil for page of reel.
|
|
54
|
+
# @return [Integer,NilClass] Page sequence number, positive integer
|
|
55
|
+
def page_sequence_number
  extent = dmd_node.xpath(".//mods:mods//mods:extent[@unit='pages']", **XML_NS)
  start = extent.xpath("mods:start", **XML_NS).first
  start.nil? ? nil : start.text.to_i
end
|
|
63
|
+
|
|
64
|
+
# Extract identifier from page ALTO, based on file name.
|
|
65
|
+
# XML parsing of big documents is expensive, so use regex to
|
|
66
|
+
# scan for fileName element, and return its value.
|
|
67
|
+
# @return [String,NilClass] file name or path, or nil.
|
|
68
|
+
def identifier
  # String#[] with a capture index returns nil when the ALTO text has no
  # fileName element, so the absent case no longer raises on nil
  file_name = page_alto[%r{<fileName>([^<]*)</fileName>}, 1]
  file_name.nil? ? nil : stripped_filename(file_name)
end
|
|
72
|
+
|
|
73
|
+
# HEIGHT attribute of the ALTO Page tag as an Integer; Ruby's to_i
# makes this 0 when the attribute is nil.
# @return [Integer]
def height
  raw_height = alto_page_meta('HEIGHT')
  raw_height.to_i
end
|
|
76
|
+
|
|
77
|
+
# WIDTH attribute of the ALTO Page tag as an Integer; Ruby's to_i
# makes this 0 when the attribute is nil.
# @return [Integer]
def width
  raw_width = alto_page_meta('WIDTH')
  raw_width.to_i
end
|
|
80
|
+
|
|
81
|
+
private
|
|
82
|
+
|
|
83
|
+
# filename stripped of base path and file extension
|
|
84
|
+
def stripped_filename(path)
  # everything before the first "." of the base name
  base_name = File.basename(path)
  base_name.split('.').first
end
|
|
87
|
+
|
|
88
|
+
# Populate @doc: reuse the parent's document when a parent is set,
# otherwise parse the file at +path+.
def load_doc
  @doc = @parent.doc unless @parent.nil?
  return unless @doc.nil?
  # block form closes the file handle as soon as parsing is done
  @doc = File.open(path) { |file| Nokogiri::XML(file) }
end
|
|
92
|
+
|
|
93
|
+
# Path to the page's ALTO file: the page file listed under the 'ocr'
# use, resolved through normalize_path.
def alto_path
  normalize_path(page_files['ocr'])
end
|
|
97
|
+
|
|
98
|
+
# Full text content of the file at alto_path, read on every call.
# @return [String]
def page_alto
  source_path = alto_path
  File.read(source_path)
end
|
|
101
|
+
|
|
102
|
+
# Read one attribute of the first <Page ...> start tag found in the
# ALTO text, located by regex rather than by parsing the whole file.
# @param key [String] attribute name, e.g. 'HEIGHT'
# @return [String, NilClass] attribute value; nil when the text has no
#   Page tag (previously this case raised on nil)
def alto_page_meta(key)
  start_tag = page_alto[/<Page [^>]*>/]
  return if start_tag.nil?
  # parse xml <Page> start tag fragment, get attributes:
  page_tag = Nokogiri::XML(start_tag).root
  page_tag.nil? ? nil : page_tag[key]
end
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'newspaper_works/ingest/ndnp/ndnp_mets_helper'
|
|
2
|
+
require 'newspaper_works/ingest/ndnp/ndnp_asset_helper'
|
|
3
|
+
require 'newspaper_works/ingest/ndnp/page_ingest'
|
|
4
|
+
require 'newspaper_works/ingest/ndnp/page_ingester'
|
|
5
|
+
require 'newspaper_works/ingest/ndnp/page_metadata'
|
|
6
|
+
require 'newspaper_works/ingest/ndnp/issue_ingest'
|
|
7
|
+
require 'newspaper_works/ingest/ndnp/issue_ingester'
|
|
8
|
+
require 'newspaper_works/ingest/ndnp/issue_metadata'
|
|
9
|
+
require 'newspaper_works/ingest/ndnp/container_ingest'
|
|
10
|
+
require 'newspaper_works/ingest/ndnp/container_ingester'
|
|
11
|
+
require 'newspaper_works/ingest/ndnp/container_metadata'
|
|
12
|
+
require 'newspaper_works/ingest/ndnp/batch_xml_ingest'
|
|
13
|
+
require 'newspaper_works/ingest/ndnp/batch_ingester'
|
|
14
|
+
|
|
15
|
+
# Declares the NewspaperWorks::Ingest::NDNP namespace; the module body
# is empty here.
module NewspaperWorks
  module Ingest
    # Module for NDNP-specific ingest components
    module NDNP
    end
  end
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
module NewspaperWorks
|
|
2
|
+
module Ingest
|
|
3
|
+
class NewspaperIssueIngest < BaseIngest
|
|
4
|
+
@configured = false
|
|
5
|
+
|
|
6
|
+
class << self
  # Class-level setup, performed once: later calls return early because
  # @configured has been set to true.
  def configure
    return if @configured == true
    # PDF ingest may save page images to /tmp (via Dir.tmpdir), which
    # needs whitelisting for use by NewspaperWorks::Data::WorkFiles.commit!
    # via Hyrax CreateWithRemoteFilesActor:
    allowed_dirs = Hyrax.config.whitelisted_ingest_dirs
    tmp_dir = Dir.tmpdir
    allowed_dirs.push(tmp_dir) unless allowed_dirs.include?(tmp_dir)
    @configured = true
  end
end
|
|
17
|
+
|
|
18
|
+
# Import the issue: run the superclass import, then create one child
# page work per page split from the PDF.
def import
  # first, handle the PDF itself on the issue...
  super
  # ...then create child works from split pages
  create_child_pages
end
|
|
24
|
+
|
|
25
|
+
# Creates child pages with attached TIFF masters, can be called by
|
|
26
|
+
# `import`, or independently if `load` is called first. The
|
|
27
|
+
# latter is appropriate if framework is already handling the
|
|
28
|
+
# NewspaperIssue file attachment (e.g. Hyrax upload via browser).
|
|
29
|
+
def create_child_pages
  self.class.configure
  tiff_paths = NewspaperWorks::Ingest::PdfPages.new(path).to_a
  tiff_paths.each_with_index do |tiff, position|
    @work.ordered_members << new_child_page_with_file(tiff, position)
  end
  # the issue is saved (without validation) only when pages were added
  @work.save!(validate: false) unless tiff_paths.empty?
end
|
|
38
|
+
|
|
39
|
+
# Build, ingest and save one child page work for a split page image.
# @param tiffpath [String] path to the page's TIFF
# @param idx [Integer] zero-based position of the page in the PDF
# @return [NewspaperPage] the saved page work
def new_child_page_with_file(tiffpath, idx)
  sequence = idx + 1
  NewspaperPage.new.tap do |child|
    child.title = ["#{@work.title.first}: Page #{sequence}"]
    # technically, a sequence number distinct from displayed page number
    child.page_number = sequence.to_s
    # Set depositor and admin-set id:
    child.depositor = @work.depositor
    child.admin_set_id = @work.admin_set_id
    # copying permissions also by effect copies visibility:
    child.permissions_attributes = @work.permissions.map(&:to_hash)
    NewspaperPageIngest.new(child).ingest(tiffpath)
    child.save!
  end
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|