newspaper_works 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.fcrepo_wrapper +4 -0
- data/.gitignore +43 -0
- data/.rubocop.yml +143 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +50 -0
- data/Gemfile +47 -0
- data/LICENSE +203 -0
- data/README.md +159 -0
- data/Rakefile +38 -0
- data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
- data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
- data/app/assets/config/newspaper_works_manifest.js +2 -0
- data/app/assets/images/newspaper_works/.keep +0 -0
- data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
- data/app/assets/javascripts/newspaper_works.js +4 -0
- data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
- data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
- data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
- data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
- data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
- data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
- data/app/forms/hyrax/newspaper_article_form.rb +11 -0
- data/app/forms/hyrax/newspaper_container_form.rb +11 -0
- data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
- data/app/forms/hyrax/newspaper_page_form.rb +15 -0
- data/app/forms/hyrax/newspaper_title_form.rb +12 -0
- data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
- data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
- data/app/helpers/newspaper_works/application_helper.rb +5 -0
- data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
- data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
- data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
- data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
- data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
- data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
- data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
- data/app/indexers/newspaper_article_indexer.rb +16 -0
- data/app/indexers/newspaper_container_indexer.rb +18 -0
- data/app/indexers/newspaper_issue_indexer.rb +26 -0
- data/app/indexers/newspaper_page_indexer.rb +9 -0
- data/app/indexers/newspaper_title_indexer.rb +19 -0
- data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
- data/app/jobs/newspaper_works/application_job.rb +4 -0
- data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
- data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
- data/app/mailers/newspaper_works/application_mailer.rb +8 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
- data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
- data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
- data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
- data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
- data/app/models/file_set.rb +10 -0
- data/app/models/newspaper_article.rb +158 -0
- data/app/models/newspaper_container.rb +86 -0
- data/app/models/newspaper_issue.rb +115 -0
- data/app/models/newspaper_page.rb +70 -0
- data/app/models/newspaper_title.rb +111 -0
- data/app/models/newspaper_works/application_record.rb +6 -0
- data/app/models/newspaper_works/derivative_attachment.rb +8 -0
- data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
- data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
- data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
- data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
- data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
- data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
- data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
- data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
- data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
- data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
- data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
- data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
- data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
- data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
- data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
- data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
- data/app/services/hyrax/article_genre_service.rb +9 -0
- data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
- data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
- data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
- data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
- data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
- data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
- data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
- data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
- data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
- data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
- data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
- data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
- data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
- data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
- data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
- data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
- data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
- data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
- data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
- data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
- data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
- data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
- data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
- data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
- data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
- data/app/views/newspaper_works/base/_show.html.erb +45 -0
- data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
- data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
- data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
- data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
- data/app/views/records/edit_fields/_genre.html.erb +4 -0
- data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
- data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/newspaper_article.de.yml +12 -0
- data/config/locales/newspaper_article.en.yml +12 -0
- data/config/locales/newspaper_article.es.yml +12 -0
- data/config/locales/newspaper_article.fr.yml +12 -0
- data/config/locales/newspaper_article.it.yml +12 -0
- data/config/locales/newspaper_article.pt-BR.yml +12 -0
- data/config/locales/newspaper_article.zh.yml +12 -0
- data/config/locales/newspaper_container.de.yml +8 -0
- data/config/locales/newspaper_container.en.yml +8 -0
- data/config/locales/newspaper_container.es.yml +8 -0
- data/config/locales/newspaper_container.fr.yml +8 -0
- data/config/locales/newspaper_container.it.yml +8 -0
- data/config/locales/newspaper_container.pt-BR.yml +8 -0
- data/config/locales/newspaper_container.zh.yml +8 -0
- data/config/locales/newspaper_issue.de.yml +8 -0
- data/config/locales/newspaper_issue.en.yml +8 -0
- data/config/locales/newspaper_issue.es.yml +8 -0
- data/config/locales/newspaper_issue.fr.yml +8 -0
- data/config/locales/newspaper_issue.it.yml +8 -0
- data/config/locales/newspaper_issue.pt-BR.yml +8 -0
- data/config/locales/newspaper_issue.zh.yml +8 -0
- data/config/locales/newspaper_page.de.yml +15 -0
- data/config/locales/newspaper_page.en.yml +15 -0
- data/config/locales/newspaper_page.es.yml +15 -0
- data/config/locales/newspaper_page.fr.yml +15 -0
- data/config/locales/newspaper_page.it.yml +15 -0
- data/config/locales/newspaper_page.pt-BR.yml +15 -0
- data/config/locales/newspaper_page.zh.yml +15 -0
- data/config/locales/newspaper_title.de.yml +8 -0
- data/config/locales/newspaper_title.en.yml +8 -0
- data/config/locales/newspaper_title.es.yml +8 -0
- data/config/locales/newspaper_title.fr.yml +8 -0
- data/config/locales/newspaper_title.it.yml +8 -0
- data/config/locales/newspaper_title.pt-BR.yml +8 -0
- data/config/locales/newspaper_title.zh.yml +8 -0
- data/config/locales/newspaper_works.de.yml +50 -0
- data/config/locales/newspaper_works.en.yml +52 -0
- data/config/locales/newspaper_works.es.yml +52 -0
- data/config/locales/newspaper_works.fr.yml +52 -0
- data/config/locales/newspaper_works.it.yml +52 -0
- data/config/locales/newspaper_works.pt-BR.yml +52 -0
- data/config/locales/newspaper_works.zh.yml +52 -0
- data/config/routes.rb +9 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
- data/lib/generators/newspaper_works/assets_generator.rb +29 -0
- data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
- data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
- data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
- data/lib/generators/newspaper_works/install_generator.rb +97 -0
- data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
- data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
- data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
- data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
- data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
- data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
- data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
- data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
- data/lib/newspaper_works/configuration.rb +14 -0
- data/lib/newspaper_works/data/fileset_helper.rb +25 -0
- data/lib/newspaper_works/data/path_helper.rb +40 -0
- data/lib/newspaper_works/data/work_derivatives.rb +314 -0
- data/lib/newspaper_works/data/work_file.rb +92 -0
- data/lib/newspaper_works/data/work_files.rb +181 -0
- data/lib/newspaper_works/data.rb +35 -0
- data/lib/newspaper_works/engine.rb +42 -0
- data/lib/newspaper_works/errors.rb +14 -0
- data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
- data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
- data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
- data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
- data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
- data/lib/newspaper_works/ingest/from_command.rb +52 -0
- data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
- data/lib/newspaper_works/ingest/issue_images.rb +51 -0
- data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
- data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
- data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
- data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
- data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
- data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
- data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
- data/lib/newspaper_works/ingest/ndnp.rb +21 -0
- data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
- data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
- data/lib/newspaper_works/ingest/page_image.rb +52 -0
- data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
- data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
- data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
- data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
- data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
- data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
- data/lib/newspaper_works/ingest/publication_info.rb +44 -0
- data/lib/newspaper_works/ingest.rb +90 -0
- data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
- data/lib/newspaper_works/logging.rb +54 -0
- data/lib/newspaper_works/page_finder.rb +62 -0
- data/lib/newspaper_works/resource_fetcher.rb +78 -0
- data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
- data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
- data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
- data/lib/newspaper_works/text_extraction.rb +10 -0
- data/lib/newspaper_works/version.rb +3 -0
- data/lib/newspaper_works.rb +19 -0
- data/lib/tasks/newspaper_works_tasks.rake +39 -0
- data/newspaper_works.gemspec +49 -0
- data/spec/.keep.txt +1 -0
- data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
- data/spec/controllers/catalog_controller_spec.rb +63 -0
- data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
- data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_issue_ingest.rb +6 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_ingest.rb +6 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/features/front_pages_for_title_spec.rb +19 -0
- data/spec/features/newspaper_title_search_spec.rb +30 -0
- data/spec/features/newspapers_search_spec.rb +49 -0
- data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
- data/spec/features_shared.rb +71 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
- data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
- data/spec/fixtures/files/resource_mocks/urls.json +82 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
- data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
- data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
- data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
- data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
- data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
- data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
- data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
- data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
- data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
- data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
- data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
- data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
- data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
- data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
- data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
- data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
- data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
- data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
- data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
- data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
- data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
- data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
- data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
- data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
- data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
- data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
- data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
- data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
- data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
- data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
- data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
- data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
- data/spec/lib/newspaper_works/logging_spec.rb +53 -0
- data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
- data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
- data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
- data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
- data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
- data/spec/misc_shared.rb +109 -0
- data/spec/model_shared.rb +134 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
- data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
- data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
- data/spec/models/newspaper_article_spec.rb +73 -0
- data/spec/models/newspaper_container_spec.rb +111 -0
- data/spec/models/newspaper_issue_spec.rb +91 -0
- data/spec/models/newspaper_page_spec.rb +44 -0
- data/spec/models/newspaper_title_spec.rb +116 -0
- data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
- data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/ndnp_shared.rb +48 -0
- data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
- data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
- data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
- data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
- data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
- data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
- data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
- data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
- data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
- data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
- data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
- data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
- data/spec/routing/route_spec.rb +52 -0
- data/spec/search_builders/custom_search_builder_spec.rb +34 -0
- data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
- data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
- data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
- data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
- data/spec/spec_helper.rb +261 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
- data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
- data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
- data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
- data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
- data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
- data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
- data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
- data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
- data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
- data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
- data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
- data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
- data/tasks/newspaperworks_dev.rake +26 -0
- data/test/integration/navigation_test.rb +7 -0
- data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
- data/test/newspaper_works_test.rb +7 -0
- data/test/test_helper.rb +17 -0
- data/tmp/.keep +0 -0
- metadata +1037 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
require 'faraday'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
require 'uri'
|
|
4
|
+
|
|
5
|
+
module NewspaperWorks
|
|
6
|
+
module Ingest
|
|
7
|
+
class LCPublicationInfo < BasePublicationInfo
|
|
8
|
+
attr_accessor :place_of_publication, :full_title, :lccn, :place_name, :doc
|
|
9
|
+
|
|
10
|
+
# Namespace prefix => URI map for the MODS v3 schema. Both the lower- and
# upper-case prefix resolve to the same namespace URI.
# NOTE(review): presumably consumed by the inherited `find` XPath helper,
#   which is not visible in this file -- confirm.
mods_namespace = 'http://www.loc.gov/mods/v3'
XML_NS = { mods: mods_namespace, MODS: mods_namespace }.freeze

# Base of every URL this class builds (see #url and #related_by).
BASE_URL = 'https://lccn.loc.gov'.freeze
|
|
16
|
+
|
|
17
|
+
# Builds publication info for one LCCN and immediately loads it.
#
# @param lccn [String] Library of Congress Control Number; forwarded to the
#   superclass constructor.
#   NOTE(review): BasePublicationInfo is not visible here; presumably it
#   stores the value in @lccn, which #url and #inspect read -- confirm.
#
# Construction is not cheap: #load runs right away and fetches the MODS
# record through NewspaperWorks::ResourceFetcher.
def initialize(lccn)
  super(lccn)
  # Start from a clean slate; #load fills these in when data is available.
  @doc = @full_title = @place_of_publication = @place_name = nil
  load
end
|
|
25
|
+
|
|
26
|
+
# Short, human-readable representation showing the class name, an id
# derived from object_id, and the LCCN.
#
# The class name and LCCN are passed as format *arguments* rather than
# interpolated into the format template, so a value containing '%' can
# neither raise ArgumentError nor be misread as a format directive.
#
# @return [String]
def inspect
  format(
    "<%<klass>s:0x000000000%<oid>x \tlccn: '%<lccn>s'>",
    klass: self.class,
    oid: object_id << 1,
    lccn: @lccn
  )
end
|
|
33
|
+
|
|
34
|
+
# URL of the MODS representation for this LCCN:
# BASE_URL, the LCCN, and the literal segment "mods", joined by slashes.
#
# @return [String]
def url
  [BASE_URL, @lccn, 'mods'].join('/')
end
|
|
37
|
+
|
|
38
|
+
# Fetches the MODS record for #url, parses it into @doc, and extracts the
# title text into @full_title.
#
# Leaves @full_title untouched when the document is empty or when no
# matching title element is found.
#
# @return [String, nil] the title text when one was found, otherwise nil
def load_lc
  response = NewspaperWorks::ResourceFetcher.get(url)
  @doc = Nokogiri.XML(response['body'])
  return if empty?
  # Prefer titleInfo[@type="uniform"]; otherwise fall back to a bare
  # (non-alternate) titleInfo that carries no @type at all.
  # -- in either case, should omit any non-sorted article (e.g. "The")
  title_node = find('//mods:titleInfo[@type="uniform"]/mods:title').first ||
               find('//mods:titleInfo[count(@type)=0]/mods:title').first
  @full_title = title_node.text unless title_node.nil?
end
|
|
49
|
+
|
|
50
|
+
# Derives a place name from the loaded MODS document.
#
# @return [String, nil] "City, Region" when the geographic subject
#   hierarchy supplies both parts; otherwise the text of the first
#   originInfo placeTerm[@type="text"]; nil when neither is present
def mods_place_name
  # The geographic subject hierarchy is the preferred source.
  city = find('//mods:hierarchicalGeographic/mods:city').first
  # Region is a state (e.g. "Utah"), province (e.g. "Ontario"), or other
  # (e.g. "England"); when no state element exists, country stands in.
  region = find('//mods:hierarchicalGeographic/mods:state').first ||
           find('//mods:hierarchicalGeographic/mods:country').first
  return "#{city.text}, #{region.text}" if city && region
  # Otherwise use the placeTerm text, which may be abbreviated in such a
  # way that geonames struggles to find it on search; for a list of
  # abbreviations, see:
  #   https://www.loc.gov/aba/publications/FreeSHM/H0810.pdf
  term = find('//mods:originInfo//mods:placeTerm[@type="text"]').first
  term.text unless term.nil?
end
|
|
65
|
+
|
|
66
|
+
# Resolve the place of publication: take the place name from the MODS
# record (falling back to one derived from @full_title), then store the
# result of the geonames URI lookup in @place_of_publication.
# Does nothing further when no place name could be determined.
def load_place
  @place_name = mods_place_name || place_name_from_title(@full_title)
  return if @place_name.nil?
  @place_of_publication = NewspaperWorks::Ingest.geonames_place_uri(@place_name)
end
|
|
72
|
+
|
|
73
|
+
# Whether there is no usable MODS content loaded.
# @return [Boolean] true when no document has been parsed, when the parsed
#   document has no root element (e.g. the response body was blank or not
#   XML), or when the root element has no children.
def empty?
  return true if @doc.nil?
  root = @doc.root
  # A document without a root previously raised NoMethodError here.
  root.nil? || root.children.empty?
end
|
|
76
|
+
|
|
77
|
+
# Populate this object: load the title data first, then the place data,
# which is skipped when no full title was found.
def load
  load_lc
  return if @full_title.nil?
  load_place
end
|
|
81
|
+
|
|
82
|
+
# Normalized publication title, without any trailing parenthetical
# qualifier: everything from the first " (" onward is dropped before the
# value is passed to NewspaperWorks::Ingest.normalize_title.
# @return [Object, nil] result of normalize_title; nil when the document is
#   empty or no title was found in it (previously the latter case raised
#   NoMethodError on nil).
def title
  return if empty? || @full_title.nil?
  NewspaperWorks::Ingest.normalize_title(@full_title.split(/ [\(]/)[0])
end
|
|
86
|
+
|
|
87
|
+
# Language of the publication, as the text of the first MODS languageTerm
# (an ISO-639-2 three character code).
# @param default [String] value used when the record names no language;
#   'eng' (English) unless overridden
# @return [String, nil] nil when no document content is loaded
def language(default = 'eng')
  return if empty?
  term = find('//mods:language/mods:languageTerm').first
  return default if term.nil?
  term.text
end
|
|
93
|
+
|
|
94
|
+
# ISSN of the publication: text of the first MODS identifier whose type is
# "issn".
# @return [String, nil] nil when the document is empty or has no ISSN
def issn
  return if empty?
  node = find('//mods:mods/mods:identifier[@type="issn"]').first
  node.text unless node.nil?
end
|
|
99
|
+
|
|
100
|
+
# OCLC number of the publication: text of the first MODS identifier whose
# type is "oclc", passed through `oclc_prefixed` (defined outside this
# method).
# @return [Object, nil] nil when the document is empty or has no OCLC
#   identifier
def oclcnum
  return if empty?
  node = find('//mods:mods/mods:identifier[@type="oclc"]').first
  oclc_prefixed(node.text) unless node.nil?
end
|
|
105
|
+
|
|
106
|
+
# Reference to the publication this one continues, looked up through the
# MODS relatedItem of type "preceding".
# @return [String, nil] whatever `related_by` yields for that relation
def preceded_by
  relation = 'preceding'
  related_by(relation)
end
|
|
109
|
+
|
|
110
|
+
# Reference to the publication that continues this one, looked up through
# the MODS relatedItem of type "succeeding".
# @return [String, nil] whatever `related_by` yields for that relation
def succeeded_by
  relation = 'succeeding'
  related_by(relation)
end
|
|
113
|
+
|
|
114
|
+
private
|
|
115
|
+
|
|
116
|
+
# Describe the first MODS relatedItem of the given type.
# @param key [String] value of relatedItem/@type, e.g. 'preceding'
# @return [String, nil] "BASE_URL/<lccn>" when the related item carries an
#   LCCN; otherwise the text of its first title child; nil when the
#   document is empty, no such related item exists, or it has neither.
def related_by(key)
  return if empty?
  related = find("//mods:relatedItem[@type='#{key}']")
  return nil if related.empty?
  item = related[0]
  lccn = lccn_for(item)
  return "#{BASE_URL}/#{lccn}" unless lccn.nil?
  # No LCCN, ergo no URL, but a related item with a literal title?
  heading = find('mods:title', item)[0]
  heading.text unless heading.nil?
end
|
|
126
|
+
|
|
127
|
+
# Extract a normalized LCCN from a related item's local identifiers.
# Only identifiers whose text starts with "(DLC)" are considered; the first
# such value is used, with the prefix and ALL whitespace removed (the
# previous code removed only the first blank, so padded values such as
# "(DLC)   85058088" kept embedded spaces).
# @param related_item [#xpath] context node handed to `find`
# @return [String, nil] nil when no "(DLC)" identifier exists or when
#   nothing follows the prefix
def lccn_for(related_item)
  identifiers = find('mods:identifier[@type="local"]', related_item)
  dlc = identifiers.select { |node| node.text.start_with?('(DLC)') }.first
  return if dlc.nil?
  lccn = dlc.text.sub('(DLC)', '').gsub(/\s+/, '')
  lccn.empty? ? nil : lccn
end
|
|
133
|
+
|
|
134
|
+
# Evaluate an XPath expression with the XML_NS namespace bindings (a
# constant defined outside this method).
# @param expr [String] XPath expression
# @param context [#xpath, nil] node to search from; defaults to the loaded
#   document (@doc)
# @return [Object] whatever `context.xpath` returns; an empty Array when
#   there is neither a context nor a loaded document, so that callers can
#   chain `.first` / `.empty?` safely (previously nil was returned)
def find(expr, context = nil)
  context ||= @doc
  return [] if context.nil?
  context.xpath(expr, **XML_NS)
end
|
|
142
|
+
end
|
|
143
|
+
end
|
|
144
|
+
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
require 'date'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
  module Ingest
    # Mixin for deducing issue metadata from path, publication info.
    #   precondition: consuming class has accessor for:
    #     - `path`: full path to issue
    #     - `publication`: a `NewspaperWorks::Ingest::PublicationInfo` object.
    module NamedIssueMetadata
      # Memoized filename from path:
      # @return [String]
      def filename
        @filename ||= File.basename(path)
      end

      # Check that `path` exists and that its file name starts with a date.
      # @raise [ArgumentError] when the path does not exist, or the name
      #   does not begin with `YYYYMMDD` (valid month/day digits),
      #   optionally followed by a two-digit edition `EE`
      def validate_path
        # expect path to exist:
        raise ArgumentError, "Path does not exist: #{path}" unless File.exist?(path)
        # `YYYYMMDDEE` with valid date digits, optional `EE` edition;
        # \A anchors at the start of the whole name, not of any line in it.
        ptn = /\A([0-9]{4})(1[012]|[0][1-9])(3[01]|[12][0-9]|0[1-9])([0-9]{2})?/
        return if ptn.match(filename)
        raise ArgumentError, "File name does not start with YYYYMMDD: #{filename}"
      end

      # Publication date stamp, read from the first eight characters of
      # the file name.
      # @return [String] ISO 8601 date stamp
      def publication_date
        year = filename.slice(0, 4).to_i
        month = filename.slice(4, 2).to_i
        day = filename.slice(6, 2).to_i
        Date.new(year, month, day).iso8601
      end

      # Issue edition number
      # @return [Integer] number of issue edition
      def edition_number
        # use file name minus file extension (if applicable, e.g. PDF);
        # a name without any extension (an issue directory) is kept whole
        # rather than reduced to an empty string.
        base = File.basename(filename, '.*')
        # default for PDF or issue dir not specifying edition value in
        #   name (before file extension, if applicable):
        return 1 if base.size < 10
        # ...otherwise use explicitly provided edition number in filename
        base.slice(8, 2).to_i
      end

      # LCCN of the publication this issue belongs to.
      # rubocop:disable Rails/Delegate
      def lccn
        publication.lccn
      end
      # rubocop:enable Rails/Delegate

      # Display title: publication title, long-form date, and the edition
      # number in parentheses when it is greater than 1.
      # @return [Array<String>] single-element array
      def title
        title_date = Date.iso8601(publication_date).strftime('%B %-d, %Y')
        v = "#{publication.title}: #{title_date}"
        v = "#{v} (#{edition_number})" if edition_number.to_i > 1
        [v]
      end
    end
  end
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
require 'date'
|
|
2
|
+
require 'find'
|
|
3
|
+
require 'optparse'
|
|
4
|
+
|
|
5
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Ingests an NDNP batch: enumerates the issues listed by the batch
      # manifest and hands each one to an IssueIngester.
      class BatchIngester
        extend NewspaperWorks::Ingest::FromCommand
        include NewspaperWorks::Logging

        attr_accessor :path, :batch, :opts

        # normalize path, possibly from directory, to contained batch
        #   manifest XML path:
        # @param path [String] manifest path, or a directory that is
        #   searched recursively for a file whose name ends in
        #   `batch_1.xml` or `batch.xml` (compared case-insensitively);
        #   a `*_1.xml` match is preferred over other matches
        # @return [String, nil] nil when a directory contains no manifest
        def self.normalize_path(path)
          return path unless File.directory?(path)
          candidates = Find.find(path).select do |f|
            f.downcase.end_with?('batch_1.xml', 'batch.xml')
          end
          # Compare on the downcased name here too, so that an upper-case
          # `BATCH_1.XML` is preferred just like its lower-case spelling.
          preferred = candidates.find { |f| f.downcase.end_with?('_1.xml') }
          preferred || candidates.first
        end

        # @param path [String] path to batch xml or directory
        # @param opts [Hash]
        #   global ingest options, to be passed to ingester components,
        #   may include administrative metadata.
        # @raise [IOError] when no batch manifest can be located
        def initialize(path, opts = {})
          @path = self.class.normalize_path(path)
          # normalize_path yields nil for a directory without a manifest;
          # check for that before asking whether the string is empty.
          raise IOError, "No batch file found: #{path}" if @path.nil? || @path.empty?
          @opts = opts
          @batch = batch_enumerator
          configure_logger('ingest')
        end

        # Ingest every issue of the batch in enumeration order, logging
        # the start and the completion of the run.
        def ingest
          write_log("Beginning NDNP batch ingest for #{@path}")
          batch.each do |issue|
            issue_ingester(issue).ingest
          end
          write_log(
            "NDNP batch ingest complete!"
          )
        end

        private

          # Return BatchIngest object as enumerable of issues:
          def batch_enumerator
            NewspaperWorks::Ingest::NDNP::BatchXMLIngest.new(path)
          end

          # Build the ingester for one issue, passing along the options
          # given to this batch ingester.
          def issue_ingester(issue)
            NewspaperWorks::Ingest::NDNP::IssueIngester.new(issue, @opts)
          end

          # String form of a date: String input is parsed with Date.parse
          # first; any other value is converted with `to_s` as-is.
          def normalize_date(v)
            (v.is_a?(String) ? Date.parse(v) : v).to_s
          end
      end
    end
  end
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Reads an NDNP batch manifest and exposes the issues it lists as an
      # Enumerable of IssueIngest objects; reel (container) entries are
      # available through `containers` / `get`.
      class BatchXMLIngest
        include Enumerable
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :container_paths, :issue_paths, :path

        delegate :size, to: :issue_paths

        # @param path [String] path to the batch manifest XML
        def initialize(path)
          @path = path
          load_doc
          # `normalize_path` is not defined in this class; presumably it is
          # supplied by NDNPMetsHelper — verify there.
          @container_paths = xpath('//ndnp:batch//ndnp:reel').map do |e|
            normalize_path(e.text)
          end
          @issue_paths = xpath('//ndnp:batch//ndnp:issue').map do |e|
            normalize_path(e.text)
          end
        end

        # @return [String] value of the `name` attribute on the batch element
        def name
          xpath('//ndnp:batch').first.attributes['name'].value
        end

        # Look up an ingest object by path: an IssueIngest when the path is
        # one of the issue paths, a ContainerIngest otherwise.
        # @param path [String]
        def get(path)
          return get_issue(path) if issue_paths.include?(path)
          get_container(path)
        end

        # @return [Array] one ingest object per issue path
        def issues
          issue_paths.map { |path| get(path) }
        end

        # @return [Array] one ingest object per container (reel) path
        def containers
          container_paths.map { |path| get(path) }
        end

        # Yield an IssueIngest for each issue path, in manifest order.
        # @return [Enumerator] when called without a block
        def each
          return enum_for(:each) unless block_given?
          @issue_paths.each do |path|
            yield get_issue(path)
          end
        end

        private

          def get_issue(path)
            NewspaperWorks::Ingest::NDNP::IssueIngest.new(path)
          end

          def get_container(path)
            NewspaperWorks::Ingest::NDNP::ContainerIngest.new(path)
          end

          # Evaluate XPath against the manifest with the NDNP namespace
          # bound to both the `ndnp` and `NDNP` prefixes.
          def xpath(expr)
            ns = {
              ndnp: 'http://www.loc.gov/ndnp',
              NDNP: 'http://www.loc.gov/ndnp'
            }
            @doc.xpath(expr, **ns)
          end

          # Parse the manifest; the block form of File.open closes the
          # file handle as soon as parsing is done.
          def load_doc
            @doc = File.open(path) { |file| Nokogiri::XML(file) }
          end
      end
    end
  end
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Source-data wrapper for one reel (container) XML file.
      class ContainerIngest
        # Enumerable of IssueIngest objects for issues in pages
        include Enumerable
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :doc, :dmdids, :issue_paths

        # @param path [String] path to the reel XML file
        def initialize(path)
          @path = path
          @doc = nil
          @metadata = nil
          # identifiers of control images, which we make accessible, but are
          #   not the primary focus of enumeration:
          @dmdids = nil
          @issue_paths = []
          load_doc
        end

        # Short representation: class name, a hexadecimal token computed
        # from `object_id << 1`, and the source path.
        def inspect
          format(
            "<#{self.class}:0x000000000%<oid>x\n" \
              "\tpath: '#{path}',\n",
            oid: object_id << 1
          )
        end

        # @return [String] the reel number taken from the reel metadata
        def identifier
          metadata.reel_number
        end

        # Return control image as PageIngest object.
        #   These objects will not have pagination/sequence data, but
        #   will provide an equivalent programmatic interface for file access
        #   of control images, as one would access normal page files.
        # @return [NewspaperWorks::Ingest::NDNP::PageIngest]
        def page_by_dmdid(dmdid)
          NewspaperWorks::Ingest::NDNP::PageIngest.new(@path, dmdid, self)
        end

        # Get IssueIngest object, given path to its XML
        # @return [NewspaperWorks::Ingest::NDNP::IssueIngest]
        def issue_by_path(path)
          NewspaperWorks::Ingest::NDNP::IssueIngest.new(path)
        end

        # Yield an IssueIngest for each discovered issue path.
        # @return [Enumerator] when called without a block
        def each
          return enum_for(:each) unless block_given?
          @issue_paths.each do |issue_path|
            yield issue_by_path(issue_path)
          end
        end

        # @return [Integer] number of discovered issue paths
        def size
          @issue_paths.size
        end

        # Memoized reel metadata built from this object's path and document.
        # @return [NewspaperWorks::Ingest::NDNP::ContainerMetadata]
        def metadata
          return @metadata unless @metadata.nil?
          @metadata = NewspaperWorks::Ingest::NDNP::ContainerMetadata.new(
            path,
            self
          )
        end

        private

          # Parse the reel XML (once), collect the DMDID of every
          # `np:target` div directly under the `np:reel` div, then
          # discover sibling issue directories.
          def load_doc
            # block form closes the file handle once parsing is done
            @doc = File.open(path) { |file| Nokogiri::XML(file) } if @doc.nil?
            page_divs = doc.xpath(
              "//mets:structMap/mets:div[@TYPE='np:reel']/" \
                "mets:div[@TYPE='np:target']",
              mets: 'http://www.loc.gov/METS/'
            )
            # identifiers for reel control images:
            @dmdids = page_divs.map { |div| div.attr('DMDID') }
            load_issue_paths
          end

          # Load instance attribute for issue paths,
          #   based on listing of directory in which reel XML is present.
          #   This is done without context of batch xml,
          #   with file name expectations based on convention,
          #   as expressed in NDNP technical guidelines,
          #   which presume that the issue XML file name will (sans extension)
          #   match directory name for the issue, in date+edition syntax.
          def load_issue_paths
            # Sorted so the issue order does not depend on the order in
            # which the file system happens to list the directories.
            issue_dirs = Dir["#{File.dirname(path)}/*/"].sort.select do |dir|
              # all-digit directory names only; \A/\z span the whole name
              !File.basename(dir).match(/\A[0-9]+\z/).nil?
            end
            @issue_paths = issue_dirs.map do |dir|
              File.join(dir, "#{File.basename(dir)}.xml")
            end
          end
      end
    end
  end
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Ingester for reel/container, given reel source data
      #   and required publication (NewspaperTitle) asset.
      #   Responsible for creating/finding container, linking
      #   to (parent) publication and (child) pages.
      class ContainerIngester
        include NewspaperWorks::Ingest::NDNP::NDNPAssetHelper

        attr_accessor :source, :target, :publication, :opts

        # Create ingester in context of source reel data, NewspaperTitle
        # @param source [NewspaperWorks::Ingest::NDNP::ContainerIngest]
        # @param publication [NewspaperTitle] Required publication to link to
        # @param opts [Hash]
        #   ingest options, e.g. administrative metadata
        def initialize(source, publication, opts = {})
          @source = source
          @publication = publication
          @opts = opts
          # stays nil until `ingest` assigns a NewspaperContainer
          @target = nil
        end

        # Find or create the container, then attach it to the publication.
        def ingest
          find_or_create_container
          link_publication
        end

        # Link a page to target container
        # @param page [NewspaperPage]
        def link(page)
          @target.ordered_members << page
          # each link is saved immediately (no deferring/bundling for now)
          @target.save!
        end

        # Set @target to the existing container for this reel number, or
        # create a new one when none is found.
        def find_or_create_container
          @target = find_container
          return unless @target.nil?
          create_container
        end

        private

          # Reel metadata supplied by the source object.
          def metadata
            @source.metadata
          end

          # First NewspaperContainer whose identifier is the reel number.
          def find_container
            reel_number = metadata.reel_number
            NewspaperContainer.where(identifier: reel_number).first
          end

          # Create a container, fill in its metadata, and save it.
          def create_container
            @target = NewspaperContainer.create
            copy_metadata
            assign_administrative_metadata
            @target.save!
          end

          # Copy identifier, title and the listed reel fields onto @target.
          def copy_metadata
            reel_number = metadata.reel_number
            @target.identifier = [reel_number]
            @target.title = ["Microform reel (#{reel_number})"]
            %i[held_by publication_date_start publication_date_end].each do |field|
              @target.send("#{field}=", metadata.send(field.to_s))
            end
          end

          # Add @target to the publication's members, unless the target
          # already reports a publication.
          def link_publication
            return unless @target.publication.nil?
            @publication.members << @target
            @publication.save!
          end
      end
    end
  end
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Metadata accessors for a reel (container) XML document.
      class ContainerMetadata
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :doc

        # @param path [String] path to the reel XML file
        # @param parent [#doc, nil] optional object whose already-parsed
        #   document is reused instead of reading the file again
        def initialize(path, parent = nil)
          @path = path
          @parent = parent
          @doc = nil
          load_doc
        end

        # Short representation: class name, a hexadecimal token computed
        # from `object_id << 1`, and the source path.
        def inspect
          format(
            "<#{self.class}:0x000000000%<oid>x\n" \
              "\tpath: '#{path}',\n",
            oid: object_id << 1
          )
        end

        # Reel Number (NDNP-mandatory)
        # @return [String] a serial number string for reel, may correspond
        #   to an issued barcode; taken from the `reel number` identifier,
        #   falling back to the LABEL attribute of the mets element
        def reel_number
          v = xpath("//mods:identifier[@type='reel number']").first
          return v.text unless v.nil?
          xpath('//mets:mets/@LABEL').first.value
        end

        # Original Source Repository (NDNP-mandatory)
        # @return [String]
        def held_by
          v = xpath("//mods:physicalLocation").first
          return v['displayLabel'] unless v.nil?
          # fallback to look at mods:note/@displayLabel, when the
          #   @type="agencyResponsibleForReproduction"
          xpath(
            '//mods:note[@type="agencyResponsibleForReproduction"]' \
            '/@displayLabel'
          ).first.value
        end

        # Media genre/form (Page Physical Description, e.g. "microform")
        #   NDNP Mandatory.
        # @return [String] value of the `type` attribute on the first
        #   matched form element
        def genre
          form = xpath('//mods:physicalDescription/MODS:form').first
          form.attributes['type'].value
        end

        # Titles (on Reel) (optional)
        # @return [String, nil] title; nil when the element is absent
        def title
          techmd('ndnp:titles')
        end

        # Start Date (optional)
        # @return [String, nil] ISO 8601 formatted date; nil when absent
        def publication_date_start
          techmd('ndnp:startDate')
        end

        # End Date (optional)
        # @return [String, nil] ISO 8601 formatted date; nil when absent
        def publication_date_end
          techmd('ndnp:endDate')
        end

        private

          # Reuse the parent's parsed document when one is available;
          # otherwise parse the file at `path`.
          def load_doc
            @doc = @parent.doc unless @parent.nil?
            # block form closes the file handle once parsing is done
            @doc = File.open(path) { |file| Nokogiri::XML(file) } if @doc.nil?
          end

          # Reel technical metadata lookup.
          # @param spec [String, nil] XPath relative to ndnp:reelTechMD
          # @return [Object] the reelTechMD node set when `spec` is nil;
          #   otherwise the text of the first match, or nil when nothing
          #   matches (the fields read through here are optional, and a
          #   missing one previously raised NoMethodError)
          def techmd(spec = nil)
            base = xpath('//ndnp:reelTechMD')
            return base if spec.nil?
            node = base.xpath(spec).first
            node.text unless node.nil?
          end
      end
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
module NewspaperWorks
  module Ingest
    module NDNP
      # Source-data wrapper for one issue XML file; enumerates the issue's
      # pages as PageIngest objects.
      class IssueIngest
        include Enumerable
        include NewspaperWorks::Ingest::NDNP::NDNPMetsHelper

        attr_accessor :path, :doc, :dmdids

        # @param path [String] path to the issue XML file
        def initialize(path)
          @path = path
          @doc = nil
          @metadata = nil
          # Enumeration based on list of DMDID loaded by load_doc
          @dmdids = nil
          load_doc
          # cache dmdid -> PageIngest
          @page_cache = {}
        end

        # Short representation: class name, a hexadecimal token computed
        # from `object_id << 1`, and the source path.
        def inspect
          format(
            "<#{self.class}:0x000000000%<oid>x\n" \
              "\tpath: '#{path}',\n",
            oid: object_id << 1
          )
        end

        # @return [String] the LCCN reported by the issue metadata
        def identifier
          metadata.lccn
        end

        # PageIngest for the given DMDID; one instance is created per
        # DMDID and reused on later calls.
        # @return [NewspaperWorks::Ingest::NDNP::PageIngest]
        def page_by_dmdid(dmdid)
          return @page_cache[dmdid] if @page_cache.key?(dmdid)
          p = NewspaperWorks::Ingest::NDNP::PageIngest.new(@path, dmdid, self)
          @page_cache[dmdid] = p
          p
        end

        # PageIngest for the page whose mods:extent start value equals `n`.
        # @param n [Integer, String] page sequence number
        # @return [NewspaperWorks::Ingest::NDNP::PageIngest, nil] nil when
        #   no page carries that sequence number (previously this raised
        #   NoMethodError)
        def page_by_sequence_number(n)
          start = doc.xpath(
            "//mods:extent//mods:start[text()='#{n}']",
            mets: 'http://www.loc.gov/METS/',
            mods: 'http://www.loc.gov/mods/v3'
          ).first
          return if start.nil?
          section = start.ancestors('dmdSec').first
          return if section.nil?
          page_by_dmdid(section['ID'])
        end

        # Yield a PageIngest for each page DMDID, in document order.
        # @return [Enumerator] when called without a block
        def each
          return enum_for(:each) unless block_given?
          @dmdids.each do |dmdid|
            yield page_by_dmdid(dmdid)
          end
        end

        # @return [Integer] number of pages found in the structMap
        def size
          @dmdids.size
        end

        # Memoized issue metadata built from this object's path and document.
        # @return [NewspaperWorks::Ingest::NDNP::IssueMetadata]
        def metadata
          return @metadata unless @metadata.nil?
          @metadata = NewspaperWorks::Ingest::NDNP::IssueMetadata.new(
            path,
            self
          )
        end

        private

          # Parse the issue XML (once) and collect the DMDID of every
          # `np:page` div in the structMap.
          def load_doc
            # block form closes the file handle once parsing is done
            @doc = File.open(path) { |file| Nokogiri::XML(file) } if @doc.nil?
            page_divs = doc.xpath(
              "//mets:structMap//mets:div[@TYPE='np:page']",
              mets: 'http://www.loc.gov/METS/'
            )
            @dmdids = page_divs.map { |div| div.attr('DMDID') }
          end
      end
    end
  end
end
|