newspaper_works 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.fcrepo_wrapper +4 -0
- data/.gitignore +43 -0
- data/.rubocop.yml +143 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +50 -0
- data/Gemfile +47 -0
- data/LICENSE +203 -0
- data/README.md +159 -0
- data/Rakefile +38 -0
- data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
- data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
- data/app/assets/config/newspaper_works_manifest.js +2 -0
- data/app/assets/images/newspaper_works/.keep +0 -0
- data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
- data/app/assets/javascripts/newspaper_works.js +4 -0
- data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
- data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
- data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
- data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
- data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
- data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
- data/app/forms/hyrax/newspaper_article_form.rb +11 -0
- data/app/forms/hyrax/newspaper_container_form.rb +11 -0
- data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
- data/app/forms/hyrax/newspaper_page_form.rb +15 -0
- data/app/forms/hyrax/newspaper_title_form.rb +12 -0
- data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
- data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
- data/app/helpers/newspaper_works/application_helper.rb +5 -0
- data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
- data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
- data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
- data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
- data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
- data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
- data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
- data/app/indexers/newspaper_article_indexer.rb +16 -0
- data/app/indexers/newspaper_container_indexer.rb +18 -0
- data/app/indexers/newspaper_issue_indexer.rb +26 -0
- data/app/indexers/newspaper_page_indexer.rb +9 -0
- data/app/indexers/newspaper_title_indexer.rb +19 -0
- data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
- data/app/jobs/newspaper_works/application_job.rb +4 -0
- data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
- data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
- data/app/mailers/newspaper_works/application_mailer.rb +8 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
- data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
- data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
- data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
- data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
- data/app/models/file_set.rb +10 -0
- data/app/models/newspaper_article.rb +158 -0
- data/app/models/newspaper_container.rb +86 -0
- data/app/models/newspaper_issue.rb +115 -0
- data/app/models/newspaper_page.rb +70 -0
- data/app/models/newspaper_title.rb +111 -0
- data/app/models/newspaper_works/application_record.rb +6 -0
- data/app/models/newspaper_works/derivative_attachment.rb +8 -0
- data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
- data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
- data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
- data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
- data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
- data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
- data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
- data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
- data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
- data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
- data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
- data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
- data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
- data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
- data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
- data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
- data/app/services/hyrax/article_genre_service.rb +9 -0
- data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
- data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
- data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
- data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
- data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
- data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
- data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
- data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
- data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
- data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
- data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
- data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
- data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
- data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
- data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
- data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
- data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
- data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
- data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
- data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
- data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
- data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
- data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
- data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
- data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
- data/app/views/newspaper_works/base/_show.html.erb +45 -0
- data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
- data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
- data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
- data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
- data/app/views/records/edit_fields/_genre.html.erb +4 -0
- data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
- data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/newspaper_article.de.yml +12 -0
- data/config/locales/newspaper_article.en.yml +12 -0
- data/config/locales/newspaper_article.es.yml +12 -0
- data/config/locales/newspaper_article.fr.yml +12 -0
- data/config/locales/newspaper_article.it.yml +12 -0
- data/config/locales/newspaper_article.pt-BR.yml +12 -0
- data/config/locales/newspaper_article.zh.yml +12 -0
- data/config/locales/newspaper_container.de.yml +8 -0
- data/config/locales/newspaper_container.en.yml +8 -0
- data/config/locales/newspaper_container.es.yml +8 -0
- data/config/locales/newspaper_container.fr.yml +8 -0
- data/config/locales/newspaper_container.it.yml +8 -0
- data/config/locales/newspaper_container.pt-BR.yml +8 -0
- data/config/locales/newspaper_container.zh.yml +8 -0
- data/config/locales/newspaper_issue.de.yml +8 -0
- data/config/locales/newspaper_issue.en.yml +8 -0
- data/config/locales/newspaper_issue.es.yml +8 -0
- data/config/locales/newspaper_issue.fr.yml +8 -0
- data/config/locales/newspaper_issue.it.yml +8 -0
- data/config/locales/newspaper_issue.pt-BR.yml +8 -0
- data/config/locales/newspaper_issue.zh.yml +8 -0
- data/config/locales/newspaper_page.de.yml +15 -0
- data/config/locales/newspaper_page.en.yml +15 -0
- data/config/locales/newspaper_page.es.yml +15 -0
- data/config/locales/newspaper_page.fr.yml +15 -0
- data/config/locales/newspaper_page.it.yml +15 -0
- data/config/locales/newspaper_page.pt-BR.yml +15 -0
- data/config/locales/newspaper_page.zh.yml +15 -0
- data/config/locales/newspaper_title.de.yml +8 -0
- data/config/locales/newspaper_title.en.yml +8 -0
- data/config/locales/newspaper_title.es.yml +8 -0
- data/config/locales/newspaper_title.fr.yml +8 -0
- data/config/locales/newspaper_title.it.yml +8 -0
- data/config/locales/newspaper_title.pt-BR.yml +8 -0
- data/config/locales/newspaper_title.zh.yml +8 -0
- data/config/locales/newspaper_works.de.yml +50 -0
- data/config/locales/newspaper_works.en.yml +52 -0
- data/config/locales/newspaper_works.es.yml +52 -0
- data/config/locales/newspaper_works.fr.yml +52 -0
- data/config/locales/newspaper_works.it.yml +52 -0
- data/config/locales/newspaper_works.pt-BR.yml +52 -0
- data/config/locales/newspaper_works.zh.yml +52 -0
- data/config/routes.rb +9 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
- data/lib/generators/newspaper_works/assets_generator.rb +29 -0
- data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
- data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
- data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
- data/lib/generators/newspaper_works/install_generator.rb +97 -0
- data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
- data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
- data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
- data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
- data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
- data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
- data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
- data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
- data/lib/newspaper_works/configuration.rb +14 -0
- data/lib/newspaper_works/data/fileset_helper.rb +25 -0
- data/lib/newspaper_works/data/path_helper.rb +40 -0
- data/lib/newspaper_works/data/work_derivatives.rb +314 -0
- data/lib/newspaper_works/data/work_file.rb +92 -0
- data/lib/newspaper_works/data/work_files.rb +181 -0
- data/lib/newspaper_works/data.rb +35 -0
- data/lib/newspaper_works/engine.rb +42 -0
- data/lib/newspaper_works/errors.rb +14 -0
- data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
- data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
- data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
- data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
- data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
- data/lib/newspaper_works/ingest/from_command.rb +52 -0
- data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
- data/lib/newspaper_works/ingest/issue_images.rb +51 -0
- data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
- data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
- data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
- data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
- data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
- data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
- data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
- data/lib/newspaper_works/ingest/ndnp.rb +21 -0
- data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
- data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
- data/lib/newspaper_works/ingest/page_image.rb +52 -0
- data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
- data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
- data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
- data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
- data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
- data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
- data/lib/newspaper_works/ingest/publication_info.rb +44 -0
- data/lib/newspaper_works/ingest.rb +90 -0
- data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
- data/lib/newspaper_works/logging.rb +54 -0
- data/lib/newspaper_works/page_finder.rb +62 -0
- data/lib/newspaper_works/resource_fetcher.rb +78 -0
- data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
- data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
- data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
- data/lib/newspaper_works/text_extraction.rb +10 -0
- data/lib/newspaper_works/version.rb +3 -0
- data/lib/newspaper_works.rb +19 -0
- data/lib/tasks/newspaper_works_tasks.rake +39 -0
- data/newspaper_works.gemspec +49 -0
- data/spec/.keep.txt +1 -0
- data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
- data/spec/controllers/catalog_controller_spec.rb +63 -0
- data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
- data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_issue_ingest.rb +6 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_ingest.rb +6 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/features/front_pages_for_title_spec.rb +19 -0
- data/spec/features/newspaper_title_search_spec.rb +30 -0
- data/spec/features/newspapers_search_spec.rb +49 -0
- data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
- data/spec/features_shared.rb +71 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
- data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
- data/spec/fixtures/files/resource_mocks/urls.json +82 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
- data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
- data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
- data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
- data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
- data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
- data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
- data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
- data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
- data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
- data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
- data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
- data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
- data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
- data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
- data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
- data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
- data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
- data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
- data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
- data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
- data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
- data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
- data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
- data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
- data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
- data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
- data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
- data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
- data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
- data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
- data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
- data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
- data/spec/lib/newspaper_works/logging_spec.rb +53 -0
- data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
- data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
- data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
- data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
- data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
- data/spec/misc_shared.rb +109 -0
- data/spec/model_shared.rb +134 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
- data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
- data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
- data/spec/models/newspaper_article_spec.rb +73 -0
- data/spec/models/newspaper_container_spec.rb +111 -0
- data/spec/models/newspaper_issue_spec.rb +91 -0
- data/spec/models/newspaper_page_spec.rb +44 -0
- data/spec/models/newspaper_title_spec.rb +116 -0
- data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
- data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/ndnp_shared.rb +48 -0
- data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
- data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
- data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
- data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
- data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
- data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
- data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
- data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
- data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
- data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
- data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
- data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
- data/spec/routing/route_spec.rb +52 -0
- data/spec/search_builders/custom_search_builder_spec.rb +34 -0
- data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
- data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
- data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
- data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
- data/spec/spec_helper.rb +261 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
- data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
- data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
- data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
- data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
- data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
- data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
- data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
- data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
- data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
- data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
- data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
- data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
- data/tasks/newspaperworks_dev.rake +26 -0
- data/test/integration/navigation_test.rb +7 -0
- data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
- data/test/newspaper_works_test.rb +7 -0
- data/test/test_helper.rb +17 -0
- data/tmp/.keep +0 -0
- metadata +1037 -0
@@ -0,0 +1,101 @@
|
|
1
|
+
module NewspaperWorks
|
2
|
+
module Ingest
|
3
|
+
module NDNP
|
4
|
+
class IssueIngester
|
5
|
+
include NewspaperWorks::Logging
|
6
|
+
include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper
|
7
|
+
include NewspaperWorks::Ingest::PubFinder
|
8
|
+
|
9
|
+
attr_accessor :issue, :target, :opts
|
10
|
+
|
11
|
+
delegate :path, to: :issue
|
12
|
+
|
13
|
+
COPY_FIELDS = [
|
14
|
+
:lccn,
|
15
|
+
:edition_number,
|
16
|
+
:edition_name,
|
17
|
+
:volume,
|
18
|
+
:publication_date,
|
19
|
+
:held_by,
|
20
|
+
:issue_number
|
21
|
+
].freeze
|
22
|
+
|
23
|
+
# @param issue [NewspaperWorks::Ingest::NDNP::IssueIngest]
|
24
|
+
# source issue data
|
25
|
+
# @param opts [Hash]
|
26
|
+
# ingest options, e.g. administrative metadata
|
27
|
+
def initialize(issue, opts = {})
|
28
|
+
@issue = issue
|
29
|
+
@opts = opts
|
30
|
+
@target = nil
|
31
|
+
configure_logger('ingest')
|
32
|
+
end
|
33
|
+
|
34
|
+
def ingest
|
35
|
+
construct_issue
|
36
|
+
ingest_pages
|
37
|
+
NewspaperWorks::ComposeIssuePDFJob.perform_later(@target)
|
38
|
+
end
|
39
|
+
|
40
|
+
def construct_issue
|
41
|
+
create_issue
|
42
|
+
find_or_create_linked_publication
|
43
|
+
end
|
44
|
+
|
45
|
+
def ingest_pages
|
46
|
+
issue.each do |page|
|
47
|
+
page_ingester(page).ingest
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
def page_ingester(page_data)
|
54
|
+
NewspaperWorks::Ingest::NDNP::PageIngester.new(
|
55
|
+
page_data,
|
56
|
+
@target,
|
57
|
+
@opts
|
58
|
+
)
|
59
|
+
end
|
60
|
+
|
61
|
+
def publication_date
|
62
|
+
parsed = DateTime.iso8601(issue.metadata.publication_date)
|
63
|
+
parsed.strftime('%B %-d, %Y')
|
64
|
+
end
|
65
|
+
|
66
|
+
def publication_title(issue)
|
67
|
+
issue.metadata.publication_title.strip.split(/ \(/)[0]
|
68
|
+
end
|
69
|
+
|
70
|
+
def issue_title
|
71
|
+
"#{publication_title(issue)}: #{publication_date}"
|
72
|
+
end
|
73
|
+
|
74
|
+
def copy_issue_metadata
|
75
|
+
metadata = issue.metadata
|
76
|
+
# set (required, plural) title from single value obtained from reel:
|
77
|
+
@target.title = [issue_title]
|
78
|
+
# copy all fields with singular (non-repeatable) values on both
|
79
|
+
# target NewspaperIssue object, and metadata source:
|
80
|
+
COPY_FIELDS.each do |fieldname|
|
81
|
+
@target.send("#{fieldname}=", metadata.send(fieldname.to_s))
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
# Create the NewspaperIssue work, fill in descriptive and administrative
# metadata, save it, and log the id of the saved work.
def create_issue
  @target = NewspaperIssue.create
  copy_issue_metadata
  assign_administrative_metadata
  @target.save!
  saved_id = @target.id
  write_log("Saved metadata to new NewspaperIssue #{saved_id}")
end
|
92
|
+
|
93
|
+
# Look up the publication title and LCCN of the source issue and pass
# them, with the issue work and ingest options, to
# find_or_create_publication_for_issue.
def find_or_create_linked_publication
  pub_title = publication_title(issue)
  pub_lccn = issue.metadata.lccn
  find_or_create_publication_for_issue(@target, pub_lccn, pub_title, @opts)
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Accessors for issue-level metadata, read via XPath from the parsed
      # XML document of an NDNP issue.
      class IssueMetadata
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :doc, :parent

        # @param path [String] path to the issue XML file
        # @param parent [Object, NilClass] optional object exposing `doc`;
        #   when present, its parsed document is reused instead of parsing
        #   the file at `path`
        def initialize(path, parent = nil)
          @path = path
          @parent = parent
          @doc = nil
          load_doc
        end

        # Short representation showing class, object id, and path.
        # The path is passed as a format argument rather than interpolated
        # into the template, so a '%' in the path is not read as a
        # format directive.
        # @return [String]
        def inspect
          format(
            "<%<klass>s:0x000000000%<oid>x\n" \
            "\tpath: '%<path>s',\n",
            klass: self.class,
            oid: object_id << 1,
            path: path
          )
        end

        # LCCN (mandatory)
        # @return [String]
        def lccn
          xpath("//mods:identifier[@type='lccn']").text
        end

        # Volume number (optional)
        # @return [String,NilClass]
        def volume
          result = xpath("//mods:detail[@type='volume']/mods:number")
          return if result.size.zero?
          result.text
        end

        # Issue number (optional)
        # @return [String,NilClass]
        def issue_number
          result = xpath("//mods:detail[@type='issue']/mods:number")
          return if result.size.zero?
          result.text
        end

        # Edition name ("caption" / "label"); optional in NDNP, but may be
        # used as a label for readability when present.
        # @return [String,NilClass]
        def edition_name
          ed_name = xpath("//mods:detail[@type='edition']/mods:caption")
          return if ed_name.size.zero?
          ed_name.text
        end

        # Edition number (mandatory)
        # @return [String]
        def edition_number
          xpath("//mods:detail[@type='edition']/mods:number").text
        end

        # Issue date (mandatory field) as ISO 8601 datestamp string
        # @return [String] (ISO-8601 date) publication date
        def publication_date
          xpath("//mods:originInfo/mods:dateIssued").text
        end

        # Title of the publication this issue belongs to.
        # @return [String,NilClass] nil when no title can be determined
        def publication_title
          # try from reel first
          reel = parent.nil? ? nil : parent.container
          return reel.metadata.title unless reel.nil?
          # fallback to parsing //mets/@LABEL, based on label convention:
          #   "ACME Times (Springfield, UT), 1911-01-25, First Edition"
          # Everything before the first ", <digit>" is kept, i.e. the name
          # and (*for now TBD*) place of publication in parentheses.
          label = xpath('//mets:mets/@LABEL').first
          return if label.nil?
          title = label.value.split(/, [0-9]/).first
          title unless title.nil? || title.empty?
        end

        # Original Source Repository (NDNP-mandatory)
        # @return [String,NilClass] nil when the document has no
        #   physicalLocation element
        def held_by
          location = xpath("//mods:physicalLocation").first
          location.nil? ? nil : location['displayLabel']
        end

        private

          # Reuse the parent's parsed document when available; otherwise
          # parse the file at `path`. The block form of File.open closes
          # the file handle once parsing is done.
          def load_doc
            @doc = @parent.doc unless @parent.nil?
            return unless @doc.nil?
            @doc = File.open(path) { |io| Nokogiri::XML(io) }
          end
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Mixin for ingesters that need to apply administrative metadata to
      # the work they are building.
      module NDNPAssetHelper
        # Apply administrative metadata, taken from the options held on the
        # ingester, to a work.
        #
        # The consuming class is expected to provide:
        #   - @target, the work asset used when no work is passed
        #   - @opts, a Hash of ingest options
        #
        # @param work [Object, NilClass] work to update; defaults to @target
        def assign_administrative_metadata(work = nil)
          asset = work || @target
          NewspaperWorks::Ingest.assign_administrative_metadata(asset, @opts)
        end
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Mixin with XPath and path helpers for classes that wrap NDNP
      # METS/MODS XML. Consumers provide `doc` and `path`; `dmd_node` and
      # `page_files` additionally need `dmdid`.
      module NDNPMetsHelper
        # Namespace prefixes (lower- and upper-case) passed to every query
        XML_NS = {
          mets: 'http://www.loc.gov/METS/',
          METS: 'http://www.loc.gov/METS/',
          mods: 'http://www.loc.gov/mods/v3',
          MODS: 'http://www.loc.gov/mods/v3',
          ndnp: 'http://www.loc.gov/ndnp',
          NDNP: 'http://www.loc.gov/ndnp'
        }.freeze

        # Evaluate an XPath expression with XML_NS supplied, so callers
        # never repeat the namespace map.
        # @param expr [String] XPath expression
        # @param context [Object, NilClass] node(s) to query; defaults to doc
        def xpath(expr, context = nil)
          (context || doc).xpath(expr, **XML_NS)
        end

        # dmdSec element(s) whose ID equals this object's dmdid
        def dmd_node
          xpath("//mets:dmdSec[@ID='#{dmdid}']")
        end

        # Paths starting with '/' are returned unchanged; any other path is
        # joined onto the directory containing `path`.
        # @param specified_path [String]
        # @return [String]
        def normalize_path(specified_path)
          if specified_path.start_with?('/')
            specified_path
          else
            File.join(File.dirname(path), specified_path)
          end
        end

        # returns hash of "use" key string to path value
        def page_files
          # get pointers from structmap:
          page_div = xpath("//mets:structMap//mets:div[@DMDID='#{dmdid}']")
          xpath('mets:fptr', page_div).each_with_object({}) do |fptr, by_use|
            file_id = fptr['FILEID']
            file_node = xpath(
              "//mets:fileSec//mets:fileGrp//mets:file[@ID='#{file_id}']"
            ).first
            use = file_node['USE']
            href = xpath('mets:FLocat', file_node).first.attribute_with_ns(
              'href',
              'http://www.w3.org/1999/xlink'
            )
            by_use[use] = href.to_s
          end
        end

        # Path "<dir>/<dir basename>_1.xml", where <dir> is the parent of
        # the directory containing `path`.
        # @return [String]
        def container_path
          source_dir = File.dirname(path)
          reel_dir = File.expand_path('..', source_dir)
          File.join(reel_dir, "#{File.basename(reel_dir)}_1.xml")
        end

        # ContainerIngest for the file at container_path, or nil when no
        # such file exists.
        def container
          reel_path = container_path
          return nil unless File.exist?(reel_path)
          NewspaperWorks::Ingest::NDNP::ContainerIngest.new(reel_path)
        end
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Source data for one page, identified by `dmdid` within the XML
      # document at `path`: its file paths and (lazily) its metadata.
      class PageIngest
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :dmdid, :doc, :files

        # @param path [String] path to XML file (required)
        # @param dmdid [String, NilClass] DMDID of the page within the document
        # @param parent [Object, NilClass] optional object exposing `doc`;
        #   when present, its parsed document is reused
        # @raise [ArgumentError] when no path is given
        def initialize(path = nil, dmdid = nil, parent = nil)
          raise ArgumentError, 'No path provided' if path.nil?
          @path = path
          @dmdid = dmdid
          @doc = nil
          @parent = parent
          @metadata = nil
          load_doc
          # file paths referenced for this page, resolved against `path`
          @files = page_files.values.map(&method(:normalize_path))
        end

        # Short representation showing class, object id, path and dmdid.
        # Values are passed as format arguments rather than interpolated
        # into the template, so a '%' in them is not read as a format
        # directive.
        # @return [String]
        def inspect
          format(
            "<%<klass>s:0x000000000%<oid>x\n" \
            "\tpath: '%<path>s',\n" \
            "\tdmdid: '%<dmdid>s' ...>",
            klass: self.class,
            oid: object_id << 1,
            path: path,
            dmdid: dmdid
          )
        end

        # Page metadata, built on first access and cached thereafter.
        # @return [NewspaperWorks::Ingest::NDNP::PageMetadata]
        def metadata
          @metadata ||= NewspaperWorks::Ingest::NDNP::PageMetadata.new(
            path,
            self,
            dmdid
          )
        end

        private

          # Reuse the parent's parsed document when available; otherwise
          # parse the file at `path`. The block form of File.open closes
          # the file handle once parsing is done.
          def load_doc
            @doc = @parent.doc unless @parent.nil?
            return unless @doc.nil?
            @doc = File.open(path) { |io| Nokogiri::XML(io) }
          end
      end
    end
  end
end
|
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'newspaper_works/logging'
|
2
|
+
|
3
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Creates a NewspaperPage work from NDNP page data, attaches its
      # files, and links it to its issue and to the container for its reel.
      class PageIngester
        include NewspaperWorks::Logging
        include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper

        attr_accessor :page, :issue, :target, :opts

        delegate :path, :dmdid, to: :page

        # Metadata fields copied from page metadata to the page work
        COPY_FIELDS = [
          :width,
          :height,
          :page_number,
          :identifier
        ].freeze

        # Subset of COPY_FIELDS whose value is wrapped in an Array on copy
        COPY_FIELDS_PLURALIZE = [
          :identifier
        ].freeze

        # @param page [NewspaperWorks::Ingest::NDNP::PageIngest]
        #   source page data
        # @param issue [NewspaperIssue]
        #   issue work the page belongs to
        # @param opts [Hash]
        #   ingest options, e.g. administrative metadata
        def initialize(page, issue, opts = {})
          @page = page
          @issue = issue
          @opts = opts
          # target is to-be-created NewspaperPage:
          @target = nil
          @work_files = nil
          configure_logger('ingest')
        end

        # Create the page work, attach its files, then link it to its reel.
        def ingest
          construct_page
          ingest_page_files
          link_reel
        end

        # Create the NewspaperPage, copy descriptive and administrative
        # metadata onto it, link it to the issue, and save it.
        def construct_page
          @target = NewspaperPage.create!(title: page_title)
          write_log(
            "Created NewspaperPage work #{@target.id} "\
            "with title '#{@target.title[0]}'"
          )
          copy_page_metadata
          assign_administrative_metadata
          link_issue
          @target.save!
          write_log("Saved metadata to NewspaperPage work #{@target.id}")
        end

        # Ingest primary, derivative files; other derivatives including
        #   thumbnail, plain-text, json will be made by NewspaperWorks
        #   derivative service components as a consequence of committing
        #   files assigned (via actor stack, via WorkFiles).
        def ingest_page_files
          @work_files = NewspaperWorks::Data::WorkFiles.new(@target)
          # block parameter is deliberately not named `path`, which would
          # shadow the `path` method delegated to the page
          page.files.each do |file_path|
            ext = file_path.downcase.split('.').last
            if ['tif', 'tiff'].include?(ext)
              ingest_primary_file(file_path)
            else
              ingest_derivative_file(file_path)
            end
          end
          write_log("Beginning file attachment process (WorkFiles.commit!) "\
            "for work #{@target.id}")
          @work_files.commit!
        end

        # Find or create the container for the page's reel and link the
        # page work to it. Does nothing when the page has no reel data.
        def link_reel
          reel_data = @page.container
          return if reel_data.nil?
          ingester = NewspaperWorks::Ingest::NDNP::ContainerIngester.new(
            reel_data,
            issue.publication,
            @opts
          )
          # find-or-create container, linked to publication:
          ingester.ingest
          # link target page to container asset for reel:
          ingester.link(@target)
        end

        private

          # Assign the primary (TIFF) file to the work. When the TIFF is
          # missing on disk, one is generated from the first page file
          # whose path ends in 'pdf'.
          # @param path [String] path to TIFF file
          def ingest_primary_file(path)
            unless File.exist?(path)
              pdf_path = page.files.find { |candidate| candidate.end_with?('pdf') }
              # make and get TIFF path (to generated tmp file):
              path = make_tiff(pdf_path)
            end
            write_log("Assigned primary file to work #{@target.id}, #{path}")
            @work_files.assign(path)
          end

          # Assign a non-TIFF file to the work as a derivative.
          # @param path [String] path to derivative file
          def ingest_derivative_file(path)
            write_log("Assigned derivative file to work #{@target.id}, #{path}")
            @work_files.derivatives.assign(path)
          end

          # Append the page work to the issue's ordered members and save
          # the issue.
          def link_issue
            issue.ordered_members << @target # page
            issue.save!
            write_log(
              "Linked NewspaperIssue work #{issue.id} "\
              "to NewspaperPage work #{@target.id}"
            )
          end

          # dir whitelist
          def whitelist
            Hyrax.config.whitelisted_ingest_dirs
          end

          # Generate TIFF in temporary file, return its path, given path to PDF
          # @param pdf_path [String] path to single-page PDF
          # @return [String] path to generated TIFF
          def make_tiff(pdf_path)
            write_log(
              "Creating TIFF from PDF in lieu of missing TIFF for work "\
              "(#{@target.id})",
              Logger::WARN
            )
            # the temp dir is added to the ingest-dir whitelist when not
            # already listed
            whitelist.push(Dir.tmpdir) unless whitelist.include?(Dir.tmpdir)
            NewspaperWorks::Ingest::PdfPages.new(pdf_path).to_a[0]
          end

          # Page title as issue title plus page number,
          #   e.g. "ACME Tribune (1910-01-02): Page 2"
          # @return [Array<String>] single-element array holding the
          #   composed page title
          def page_title
            ["#{issue.title.first}: Page #{@page.metadata.page_number}"]
          end

          # Copy each COPY_FIELDS value from the page metadata to the
          # same-named setter on the target NewspaperPage; fields listed in
          # COPY_FIELDS_PLURALIZE are wrapped in an Array first.
          def copy_page_metadata
            metadata = page.metadata
            COPY_FIELDS.each do |fieldname|
              value = metadata.send(fieldname.to_s)
              pluralize = COPY_FIELDS_PLURALIZE.include?(fieldname)
              @target.send("#{fieldname}=", pluralize ? [value] : value)
            end
          end
      end
    end
  end
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Accessors for page-level metadata, read from the page's dmdSec in
      # the issue/reel XML and from the page's ALTO (OCR) file.
      class PageMetadata
        # mixin convenience methods for NDNP XML, plus XML_NS hash
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :dmdid, :doc

        # @param path [String, NilClass] path to XML file
        # @param parent [Object, NilClass] optional object exposing `doc`;
        #   when present, its parsed document is reused
        # @param dmdid [String, NilClass] DMDID of the page within the document
        # @raise [ArgumentError] when neither path nor parent is given
        def initialize(path = nil, parent = nil, dmdid = nil)
          raise ArgumentError, 'No context provided' if path.nil? && parent.nil?
          @path = path
          @parent = parent
          @dmdid = dmdid
          @doc = nil
          load_doc
        end

        # Short representation showing class, object id, path and dmdid.
        # Values are passed as format arguments rather than interpolated
        # into the template, so a '%' in them is not read as a format
        # directive.
        # @return [String]
        def inspect
          format(
            "<%<klass>s:0x000000000%<oid>x\n" \
            "\tpath: '%<path>s',\n" \
            "\tdmdid: '%<dmdid>s' ...>",
            klass: self.class,
            oid: object_id << 1,
            path: path,
            dmdid: dmdid
          )
        end

        # Printed page number, if printed; optional field in NDNP spec.
        #   "Number" is used liberally, and may contain both alpha
        #   and numeric characters.  As such, return value is String.
        #
        #   If NDNP issue data fails to provide an explicitly
        #   human-readable page number, fallback to sequence
        #   number, in String form.
        #
        # @return [String, NilClass] Page "number" string
        def page_number
          number = xpath(
            ".//mods:mods//mods:detail[@type='page number']/mods:number",
            dmd_node
          ).first
          return number.text unless number.nil?
          # no page-number detail, or a detail without a number element:
          fallback = page_sequence_number
          fallback.nil? ? nil : fallback.to_s
        end

        # Page sequence number, indexical to order in issue.
        #   "Number" here is one-indexed positive integer, position in
        #   issue.  Mandatory for page of issue, nil for page of reel.
        # @return [Integer,NilClass] Page sequence number, positive integer
        def page_sequence_number
          extent = xpath(".//mods:mods//mods:extent[@unit='pages']", dmd_node)
          node = xpath('mods:start', extent).first
          node.text.to_i unless node.nil?
        end

        # Extract identifier from page ALTO, based on file name.
        #   XML parsing of big documents is expensive, so use regex to
        #   scan for fileName element, and return its value.
        # @return [String,NilClass] file name stripped of directory and
        #   extension, or nil when the ALTO has no fileName element
        def identifier
          file_name = page_alto[%r{<fileName>([^<]*)</fileName>}, 1]
          file_name.nil? ? nil : stripped_filename(file_name)
        end

        # @return [Integer] HEIGHT attribute of the ALTO Page element
        #   (0 when unavailable)
        def height
          alto_page_meta('HEIGHT').to_i
        end

        # @return [Integer] WIDTH attribute of the ALTO Page element
        #   (0 when unavailable)
        def width
          alto_page_meta('WIDTH').to_i
        end

        private

          # filename stripped of base path and file extension
          def stripped_filename(path)
            File.basename(path).split('.')[0]
          end

          # Reuse the parent's parsed document when available; otherwise
          # parse the file at `path`. The block form of File.open closes
          # the file handle once parsing is done.
          def load_doc
            @doc = @parent.doc unless @parent.nil?
            return unless @doc.nil?
            @doc = File.open(path) { |io| Nokogiri::XML(io) }
          end

          # Path to the page file registered under the 'ocr' use
          def alto_path
            specified_path = page_files['ocr']
            normalize_path(specified_path)
          end

          # Full text content of the ALTO file
          def page_alto
            File.read(alto_path)
          end

          # Attribute value from the first <Page ...> start tag in the ALTO.
          # @param key [String] attribute name, e.g. 'HEIGHT'
          # @return [String,NilClass] nil when there is no Page tag, or
          #   the tag lacks the attribute
          def alto_page_meta(key)
            page_tag_source = page_alto[/<Page [^>]*>/]
            return if page_tag_source.nil?
            # parse xml <Page> start tag fragment, get attributes:
            page_tag = Nokogiri::XML(page_tag_source).root
            page_tag.nil? ? nil : page_tag[key]
          end
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'newspaper_works/ingest/ndnp/ndnp_mets_helper'
|
2
|
+
require 'newspaper_works/ingest/ndnp/ndnp_asset_helper'
|
3
|
+
require 'newspaper_works/ingest/ndnp/page_ingest'
|
4
|
+
require 'newspaper_works/ingest/ndnp/page_ingester'
|
5
|
+
require 'newspaper_works/ingest/ndnp/page_metadata'
|
6
|
+
require 'newspaper_works/ingest/ndnp/issue_ingest'
|
7
|
+
require 'newspaper_works/ingest/ndnp/issue_ingester'
|
8
|
+
require 'newspaper_works/ingest/ndnp/issue_metadata'
|
9
|
+
require 'newspaper_works/ingest/ndnp/container_ingest'
|
10
|
+
require 'newspaper_works/ingest/ndnp/container_ingester'
|
11
|
+
require 'newspaper_works/ingest/ndnp/container_metadata'
|
12
|
+
require 'newspaper_works/ingest/ndnp/batch_xml_ingest'
|
13
|
+
require 'newspaper_works/ingest/ndnp/batch_ingester'
|
14
|
+
|
15
|
+
module NewspaperWorks
  module Ingest
    # Namespace for the NDNP-specific ingest components
    module NDNP; end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module NewspaperWorks
  module Ingest
    # Ingest for a NewspaperIssue from a PDF: handles the issue's own file
    # via BaseIngest, then creates one child NewspaperPage per PDF page.
    class NewspaperIssueIngest < BaseIngest
      @configured = false

      class << self
        # One-time setup, skipped once @configured has been set.
        #
        # PDF ingest may save page images to /tmp (via Dir.tmpdir), which
        #   needs whitelisting for use by NewspaperWorks::Data::WorkFiles.commit!
        #   via Hyrax CreateWithRemoteFilesActor.
        def configure
          unless @configured == true
            ingest_dirs = Hyrax.config.whitelisted_ingest_dirs
            ingest_dirs.push(Dir.tmpdir) unless ingest_dirs.include?(Dir.tmpdir)
            @configured = true
          end
        end
      end

      # Handle the PDF itself on the issue (superclass behavior), then
      # create child works from the split pages.
      def import
        super
        create_child_pages
      end

      # Creates child pages with attached TIFF masters, can be called by
      #   `import`, or independently if `load` is called first.  The
      #   latter is appropriate if framework is already handling the
      #   NewspaperIssue file attachment (e.g. Hyrax upload via browser).
      def create_child_pages
        self.class.configure
        tiff_paths = NewspaperWorks::Ingest::PdfPages.new(path).to_a
        tiff_paths.each_with_index do |tiffpath, idx|
          child = new_child_page_with_file(tiffpath, idx)
          @work.ordered_members << child
        end
        return if tiff_paths.empty?
        @work.save!(validate: false)
      end

      # Build, ingest, and save one child page.
      # @param tiffpath [String] path to the page image
      # @param idx [Integer] zero-based position of the page
      # @return [NewspaperPage] the saved page work
      def new_child_page_with_file(tiffpath, idx)
        sequence = idx + 1
        child = NewspaperPage.new
        child.title = ["#{@work.title.first}: Page #{sequence}"]
        # technically, a sequence number distinct from displayed page number
        child.page_number = sequence.to_s
        # Set depositor and admin-set id:
        child.depositor = @work.depositor
        child.admin_set_id = @work.admin_set_id
        # copying permissions also by effect copies visibility:
        child.permissions_attributes = @work.permissions.map(&:to_hash)
        NewspaperPageIngest.new(child).ingest(tiffpath)
        child.save!
        child
      end
    end
  end
end
|