newspaper_works 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.fcrepo_wrapper +4 -0
- data/.gitignore +43 -0
- data/.rubocop.yml +143 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +50 -0
- data/Gemfile +47 -0
- data/LICENSE +203 -0
- data/README.md +159 -0
- data/Rakefile +38 -0
- data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
- data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
- data/app/assets/config/newspaper_works_manifest.js +2 -0
- data/app/assets/images/newspaper_works/.keep +0 -0
- data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
- data/app/assets/javascripts/newspaper_works.js +4 -0
- data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
- data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
- data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
- data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
- data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
- data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
- data/app/forms/hyrax/newspaper_article_form.rb +11 -0
- data/app/forms/hyrax/newspaper_container_form.rb +11 -0
- data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
- data/app/forms/hyrax/newspaper_page_form.rb +15 -0
- data/app/forms/hyrax/newspaper_title_form.rb +12 -0
- data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
- data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
- data/app/helpers/newspaper_works/application_helper.rb +5 -0
- data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
- data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
- data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
- data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
- data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
- data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
- data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
- data/app/indexers/newspaper_article_indexer.rb +16 -0
- data/app/indexers/newspaper_container_indexer.rb +18 -0
- data/app/indexers/newspaper_issue_indexer.rb +26 -0
- data/app/indexers/newspaper_page_indexer.rb +9 -0
- data/app/indexers/newspaper_title_indexer.rb +19 -0
- data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
- data/app/jobs/newspaper_works/application_job.rb +4 -0
- data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
- data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
- data/app/mailers/newspaper_works/application_mailer.rb +8 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
- data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
- data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
- data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
- data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
- data/app/models/file_set.rb +10 -0
- data/app/models/newspaper_article.rb +158 -0
- data/app/models/newspaper_container.rb +86 -0
- data/app/models/newspaper_issue.rb +115 -0
- data/app/models/newspaper_page.rb +70 -0
- data/app/models/newspaper_title.rb +111 -0
- data/app/models/newspaper_works/application_record.rb +6 -0
- data/app/models/newspaper_works/derivative_attachment.rb +8 -0
- data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
- data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
- data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
- data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
- data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
- data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
- data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
- data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
- data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
- data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
- data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
- data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
- data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
- data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
- data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
- data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
- data/app/services/hyrax/article_genre_service.rb +9 -0
- data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
- data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
- data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
- data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
- data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
- data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
- data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
- data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
- data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
- data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
- data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
- data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
- data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
- data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
- data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
- data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
- data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
- data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
- data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
- data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
- data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
- data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
- data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
- data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
- data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
- data/app/views/newspaper_works/base/_show.html.erb +45 -0
- data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
- data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
- data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
- data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
- data/app/views/records/edit_fields/_genre.html.erb +4 -0
- data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
- data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/newspaper_article.de.yml +12 -0
- data/config/locales/newspaper_article.en.yml +12 -0
- data/config/locales/newspaper_article.es.yml +12 -0
- data/config/locales/newspaper_article.fr.yml +12 -0
- data/config/locales/newspaper_article.it.yml +12 -0
- data/config/locales/newspaper_article.pt-BR.yml +12 -0
- data/config/locales/newspaper_article.zh.yml +12 -0
- data/config/locales/newspaper_container.de.yml +8 -0
- data/config/locales/newspaper_container.en.yml +8 -0
- data/config/locales/newspaper_container.es.yml +8 -0
- data/config/locales/newspaper_container.fr.yml +8 -0
- data/config/locales/newspaper_container.it.yml +8 -0
- data/config/locales/newspaper_container.pt-BR.yml +8 -0
- data/config/locales/newspaper_container.zh.yml +8 -0
- data/config/locales/newspaper_issue.de.yml +8 -0
- data/config/locales/newspaper_issue.en.yml +8 -0
- data/config/locales/newspaper_issue.es.yml +8 -0
- data/config/locales/newspaper_issue.fr.yml +8 -0
- data/config/locales/newspaper_issue.it.yml +8 -0
- data/config/locales/newspaper_issue.pt-BR.yml +8 -0
- data/config/locales/newspaper_issue.zh.yml +8 -0
- data/config/locales/newspaper_page.de.yml +15 -0
- data/config/locales/newspaper_page.en.yml +15 -0
- data/config/locales/newspaper_page.es.yml +15 -0
- data/config/locales/newspaper_page.fr.yml +15 -0
- data/config/locales/newspaper_page.it.yml +15 -0
- data/config/locales/newspaper_page.pt-BR.yml +15 -0
- data/config/locales/newspaper_page.zh.yml +15 -0
- data/config/locales/newspaper_title.de.yml +8 -0
- data/config/locales/newspaper_title.en.yml +8 -0
- data/config/locales/newspaper_title.es.yml +8 -0
- data/config/locales/newspaper_title.fr.yml +8 -0
- data/config/locales/newspaper_title.it.yml +8 -0
- data/config/locales/newspaper_title.pt-BR.yml +8 -0
- data/config/locales/newspaper_title.zh.yml +8 -0
- data/config/locales/newspaper_works.de.yml +50 -0
- data/config/locales/newspaper_works.en.yml +52 -0
- data/config/locales/newspaper_works.es.yml +52 -0
- data/config/locales/newspaper_works.fr.yml +52 -0
- data/config/locales/newspaper_works.it.yml +52 -0
- data/config/locales/newspaper_works.pt-BR.yml +52 -0
- data/config/locales/newspaper_works.zh.yml +52 -0
- data/config/routes.rb +9 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
- data/lib/generators/newspaper_works/assets_generator.rb +29 -0
- data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
- data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
- data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
- data/lib/generators/newspaper_works/install_generator.rb +97 -0
- data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
- data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
- data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
- data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
- data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
- data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
- data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
- data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
- data/lib/newspaper_works/configuration.rb +14 -0
- data/lib/newspaper_works/data/fileset_helper.rb +25 -0
- data/lib/newspaper_works/data/path_helper.rb +40 -0
- data/lib/newspaper_works/data/work_derivatives.rb +314 -0
- data/lib/newspaper_works/data/work_file.rb +92 -0
- data/lib/newspaper_works/data/work_files.rb +181 -0
- data/lib/newspaper_works/data.rb +35 -0
- data/lib/newspaper_works/engine.rb +42 -0
- data/lib/newspaper_works/errors.rb +14 -0
- data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
- data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
- data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
- data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
- data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
- data/lib/newspaper_works/ingest/from_command.rb +52 -0
- data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
- data/lib/newspaper_works/ingest/issue_images.rb +51 -0
- data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
- data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
- data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
- data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
- data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
- data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
- data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
- data/lib/newspaper_works/ingest/ndnp.rb +21 -0
- data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
- data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
- data/lib/newspaper_works/ingest/page_image.rb +52 -0
- data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
- data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
- data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
- data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
- data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
- data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
- data/lib/newspaper_works/ingest/publication_info.rb +44 -0
- data/lib/newspaper_works/ingest.rb +90 -0
- data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
- data/lib/newspaper_works/logging.rb +54 -0
- data/lib/newspaper_works/page_finder.rb +62 -0
- data/lib/newspaper_works/resource_fetcher.rb +78 -0
- data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
- data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
- data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
- data/lib/newspaper_works/text_extraction.rb +10 -0
- data/lib/newspaper_works/version.rb +3 -0
- data/lib/newspaper_works.rb +19 -0
- data/lib/tasks/newspaper_works_tasks.rake +39 -0
- data/newspaper_works.gemspec +49 -0
- data/spec/.keep.txt +1 -0
- data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
- data/spec/controllers/catalog_controller_spec.rb +63 -0
- data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
- data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_issue_ingest.rb +6 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_ingest.rb +6 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/features/front_pages_for_title_spec.rb +19 -0
- data/spec/features/newspaper_title_search_spec.rb +30 -0
- data/spec/features/newspapers_search_spec.rb +49 -0
- data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
- data/spec/features_shared.rb +71 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
- data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
- data/spec/fixtures/files/resource_mocks/urls.json +82 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
- data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
- data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
- data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
- data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
- data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
- data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
- data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
- data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
- data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
- data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
- data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
- data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
- data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
- data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
- data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
- data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
- data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
- data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
- data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
- data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
- data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
- data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
- data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
- data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
- data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
- data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
- data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
- data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
- data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
- data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
- data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
- data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
- data/spec/lib/newspaper_works/logging_spec.rb +53 -0
- data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
- data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
- data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
- data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
- data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
- data/spec/misc_shared.rb +109 -0
- data/spec/model_shared.rb +134 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
- data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
- data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
- data/spec/models/newspaper_article_spec.rb +73 -0
- data/spec/models/newspaper_container_spec.rb +111 -0
- data/spec/models/newspaper_issue_spec.rb +91 -0
- data/spec/models/newspaper_page_spec.rb +44 -0
- data/spec/models/newspaper_title_spec.rb +116 -0
- data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
- data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/ndnp_shared.rb +48 -0
- data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
- data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
- data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
- data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
- data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
- data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
- data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
- data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
- data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
- data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
- data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
- data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
- data/spec/routing/route_spec.rb +52 -0
- data/spec/search_builders/custom_search_builder_spec.rb +34 -0
- data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
- data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
- data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
- data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
- data/spec/spec_helper.rb +261 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
- data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
- data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
- data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
- data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
- data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
- data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
- data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
- data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
- data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
- data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
- data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
- data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
- data/tasks/newspaperworks_dev.rake +26 -0
- data/test/integration/navigation_test.rb +7 -0
- data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
- data/test/newspaper_works_test.rb +7 -0
- data/test/test_helper.rb +17 -0
- data/tmp/.keep +0 -0
- metadata +1037 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
require 'newspaper_works/data'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
|
|
4
|
+
module Ingest
|
|
5
|
+
# base class for ingesting works, implements, as-needed, temp files
|
|
6
|
+
class BaseIngest
|
|
7
|
+
include NewspaperWorks::Data::PathHelper
|
|
8
|
+
|
|
9
|
+
attr_accessor :work, :io, :path, :filename
|
|
10
|
+
|
|
11
|
+
def initialize(work)
|
|
12
|
+
# adapted context:
|
|
13
|
+
@work = work
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def loadpath(source)
|
|
17
|
+
# quick check the file exists and is readable on filesystem:
|
|
18
|
+
raise ArgumentError, 'File not found or readable' unless
|
|
19
|
+
File.readable?(source)
|
|
20
|
+
# path may be relative to Dir.pwd, but no matter for our use
|
|
21
|
+
@path = source.to_s
|
|
22
|
+
@io = File.open(@path)
|
|
23
|
+
@filename ||= File.split(@path)[-1]
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def loadio(source)
|
|
27
|
+
# either an IO with a path, or an IO with filename passed in
|
|
28
|
+
# args; presume we need a filename to describe/identify.
|
|
29
|
+
raise ArgumentError, 'Explicit or inferred file name required' unless
|
|
30
|
+
source.respond_to?('path') || @filename
|
|
31
|
+
@io = source
|
|
32
|
+
@path = source.respond_to?('path') ? source.path : nil
|
|
33
|
+
@filename ||= File.split(@path)[-1]
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def load(source, filename: nil)
|
|
37
|
+
# source is a string path, Pathname object, or quacks like an IO
|
|
38
|
+
unless source.class == String ||
|
|
39
|
+
source.class == Pathname ||
|
|
40
|
+
source.respond_to?('read')
|
|
41
|
+
raise ArgumentError, 'Source is neither path nor IO object'
|
|
42
|
+
end
|
|
43
|
+
# permit the possibility of a filename identifier metadata distinct
|
|
44
|
+
# from the actual path on disk:
|
|
45
|
+
@filename = filename
|
|
46
|
+
ispath = source.class == String || source.class == Pathname
|
|
47
|
+
loader = ispath ? method(:loadpath) : method(:loadio)
|
|
48
|
+
loader.call(source)
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
# default handler attaches file to work's file set, subclasses
|
|
52
|
+
# may overwride or wrap this.
|
|
53
|
+
def import
|
|
54
|
+
files = NewspaperWorks::Data::WorkFiles.new(work)
|
|
55
|
+
files.assign(path)
|
|
56
|
+
files.commit!
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def user
|
|
60
|
+
defined?(current_user) ? current_user : User.batch_user
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
# Convenience entry point: load the source, then import it.
#
# @param source [String, Pathname, #read] path or IO-like object,
#   passed through to `load`
# @param filename [String, nil] optional identifying file name,
#   passed through to `load`
# @return whatever `import` returns
def ingest(source, filename: nil)
  # step 1: resolve the source (path or IO) and its file name
  load(source, filename: filename)
  # step 2: hand the loaded source to the import handler
  import
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
module NewspaperWorks
  module Ingest
    # Abstract base for publication metadata keyed by LCCN. Construction
    # stores the LCCN and immediately calls `load`, which subclasses must
    # implement.
    class BasePublicationInfo
      attr_accessor :lccn, :issn

      # @param lccn [String] Library of Congress Control Number
      # @raise [NotImplementedError] unless a subclass implements `load`
      def initialize(lccn)
        @lccn = lccn
        load
      end

      # Load metadata for the LCCN; to be implemented by subclasses.
      # @raise [NotImplementedError] always, in this base class
      def load
        raise NotImplementedError, "abstract"
      end

      # Return normalized, prefixed OCLC number from numeric Integer or
      # String inputs. Any prefix or other non-digit characters in the
      # input are discarded, and the prefix is recomputed from the number
      # of digits:
      #   - up to 8 digits: "ocm", zero-padded to 8 digits
      #   - 9 digits: "ocn"
      #   - 10 or more digits: "on"
      # Input without any digits yields a bare "on".
      # @param oclcnum [String, Integer] prefixed or unprefixed OCLC control #
      # @return [String] normalized, prefixed OCLC number
      def oclc_prefixed(oclcnum)
        # unprefixed number, as string of digits only
        digits = oclcnum.to_s.gsub(/\D/, '')
        # short numbers belong to the 8-digit, zero-padded "ocm" form
        digits = digits.rjust(8, '0') unless digits.empty?
        return "ocm#{digits}" if digits.size == 8
        return "ocn#{digits}" if digits.size == 9
        "on#{digits}"
      end

      # Extract a parenthesized place name from a title such as
      # "The tribune (Salt Lake City, Utah)".
      # @param title [String]
      # @return [String, nil] text between the first " (" and the next ")",
      #   or nil when the title has no " (" part
      def place_name_from_title(title)
        parts = title.split(/ [\(]/)
        return if parts.size < 2
        parts[1].split(')')[0]
      end
    end
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
require 'find'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
  module Ingest
    # mixin module for common batch ingest steps
    module BatchIngestHelper
      # Decide whether a directory tree holds page images or PDFs.
      # @param path [String] root of the tree to inspect
      # @return [String] 'image' if any path under (and including) the root
      #   ends with 'jp2' or matches TIF/TIFF case-insensitively,
      #   otherwise 'pdf' (the default)
      def detect_media(path)
        # enumerator form stops walking the tree at the first image found
        image_found = Find.find(path).any? do |entry|
          entry.end_with?('jp2') || /TIF[F]?$/i.match?(entry)
        end
        image_found ? 'image' : 'pdf'
      end

      # @param path [String] batch directory path
      # @return [String] last path component, used as the LCCN
      def lccn_from_path(path)
        File.basename(path)
      end

      # Normalize an LCCN: remove all whitespace, downcase, and truncate to
      # 13 characters; the result must be up to three letters followed by
      # 8 or 10 digits.
      # @param v [String] LCCN as given
      # @return [String] normalized LCCN
      # @raise [ArgumentError] if the normalized value does not match
      def normalize_lccn(v)
        # whitespace (including newlines) is stripped before matching, so
        # whole-string anchors are equivalent to line anchors here
        pattern = /\A[A-Za-z]{0,3}[0-9]{8}([0-9]{2})?\z/
        v = v.gsub(/\s+/, '').downcase.slice(0, 13)
        raise ArgumentError, "LCCN appears invalid: #{v}" unless pattern.match?(v)
        v
      end

      # @param issue_data [#title] issue metadata source
      # @return title of the issue, as reported by issue_data
      def issue_title(issue_data)
        issue_data.title
      end

      # Copy title, LCCN, publication date and edition number from source
      # issue metadata onto a target work.
      # @param source [#title, #lccn, #publication_date, #edition_number]
      # @param target object with matching attribute writers
      def copy_issue_metadata(source, target)
        target.title = issue_title(source)
        target.lccn = source.lccn
        target.publication_date = source.publication_date
        target.edition_number = source.edition_number
      end

      # Attach the file at path to a work via
      # NewspaperWorks::Data::WorkFiles, and commit the change.
      # @param work work object to attach to
      # @param path [String] path of file to attach
      def attach_file(work, path)
        attachment = NewspaperWorks::Data::WorkFiles.of(work)
        attachment.assign(path)
        attachment.commit!
      end
    end
  end
end
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
require 'open3'
|
|
2
|
+
require 'tmpdir'
|
|
3
|
+
|
|
4
|
+
module NewspaperWorks
  module Ingest
    # Batch ingest of the issues found under one publication directory,
    # either from issue PDFs or from page images, depending on the media
    # detected under the path.
    class BatchIssueIngester
      # CLI constructor, related class methods:
      extend NewspaperWorks::Ingest::FromCommand

      include NewspaperWorks::Ingest::PubFinder
      include NewspaperWorks::Ingest::BatchIngestHelper
      include NewspaperWorks::Logging

      attr_accessor :path, :lccn, :publication, :opts, :issues

      # @param path [String] path to publication directory containing issues
      # @param opts [Hash] ingest options; :lccn, when present, overrides
      #   the LCCN otherwise taken from the directory name
      # @raise [ArgumentError] if the LCCN does not validate
      def initialize(path, opts = {})
        @path = path
        lccn = opts[:lccn]
        lccn = lccn_from_path(path) if lccn.nil?
        @lccn = normalize_lccn(lccn)
        # get publication info for LCCN from authority web service:
        @publication = NewspaperWorks::Ingest::PublicationInfo.new(@lccn)
        # issues for publication, as enumerable
        @issues = issue_enumerator
        @opts = opts
        configure_logger('ingest')
      end

      # Issue enumerator; implementation depends on detected media.
      # @return [NewspaperWorks::Ingest::PDFIssues,
      #   NewspaperWorks::Ingest::ImageIngestIssues]
      def issue_enumerator
        impl = NewspaperWorks::Ingest::PDFIssues
        impl = NewspaperWorks::Ingest::ImageIngestIssues if detect_media(path) == 'image'
        impl.new(path, publication)
      end

      # Link issue to its publication, via PubFinder.
      # @param issue [NewspaperIssue]
      def link_publication(issue)
        find_or_create_publication_for_issue(
          issue,
          @lccn,
          @publication.title,
          @opts
        )
      end

      # Create, save, log and link a NewspaperIssue from issue metadata.
      # @param issue_data issue metadata source (title, lccn,
      #   publication_date, edition_number)
      # @return [NewspaperIssue] the saved issue
      def create_issue(issue_data)
        issue = NewspaperIssue.create
        copy_issue_metadata(issue_data, issue)
        NewspaperWorks::Ingest.assign_administrative_metadata(issue, @opts)
        issue.save!
        write_log(
          "Created new NewspaperIssue work with date, lccn, edition metadata:"\
          "\n"\
          "\tLCCN: #{@lccn}\n"\
          "\tPublication Date: #{issue_data.publication_date}\n"\
          "\tEdition number: #{issue_data.edition_number}"
        )
        link_publication(issue)
        issue
      end

      # @param issue [NewspaperIssue] issue work
      # @param path [String] path to the issue's PDF
      def ingest_pdf(issue, path)
        # ingest primary PDF for issue:
        attach_file(issue, path)
        # queue page creation job:
        CreateIssuePagesJob.perform_later(issue, [path], nil, nil)
      end

      # Create a NewspaperPage for a page image, link it to the issue, and
      # attach the image file (JP2 sources are converted to TIFF first).
      # @param page_image page image metadata (title, page_number, path)
      # @param issue [NewspaperIssue] parent issue
      def create_page(page_image, issue)
        page = NewspaperPage.create
        page.title = page_image.title
        page.page_number = page_image.page_number
        page.save!
        # Link page as a child of issue, via ordered members:
        issue.ordered_members << page
        # NOTE(review): administrative metadata is assigned after
        #   page.save! and the page is not explicitly saved again in this
        #   method; confirm it is persisted by a later step.
        NewspaperWorks::Ingest.assign_administrative_metadata(page, @opts)
        issue.save!
        # Ensure we have a source TIFF file, attach to page:
        path = page_image.path
        path = make_tiff(path) if path.end_with?('jp2')
        attach_file(page, path)
      end

      # @param issue [NewspaperIssue] issue work
      # @param issue_data enumerable of page images for the issue
      def ingest_pages(issue, issue_data)
        # Create pages in order they appear (lexical)
        issue_data.each_value { |page_image| create_page(page_image, issue) }
        # Make an issue PDF from constituent pages, via retryable async job,
        # which will not succeed until the PDF derivatives are created
        # for each page, but should eventually succeed on that condition:
        NewspaperWorks::ComposeIssuePDFJob.perform_later(issue)
      end

      # Convert a JP2 file to a TIFF in a new temporary directory, using
      # the `opj_decompress` command.
      # @param path [String] path to source file, must end with 'jp2'
      # @return [String] path to the converted TIFF
      # @raise [ArgumentError] if path does not end with 'jp2'
      # @raise [RuntimeError] if the converter writes to stderr or exits
      #   with a failure status
      def make_tiff(path)
        raise ArgumentError unless path.end_with?('jp2')
        Hyrax.config.whitelisted_ingest_dirs |= [Dir.tmpdir]
        name = File.basename(path).split('.')[0]
        # OpenJPEG2000 has weird quirk, only likes 3-char file ext TIF:
        tiff_path = File.join(Dir.mktmpdir, "#{name}.tif")
        # Arguments are passed as a list, bypassing the shell, so paths
        # containing spaces or shell metacharacters are handled safely;
        # capture3 drains both stdout and stderr, avoiding a blocked pipe.
        _stdout, stderr, status = Open3.capture3(
          'opj_decompress', '-i', path, '-o', tiff_path
        )
        unless stderr.strip.empty? && status.success?
          msg = "Error converting JP2 to TIFF: #{path}"
          write_log(msg, Logger::ERROR)
          raise msg
        end
        tiff_path
      end

      # @return [String] 'issue_pdf' when issues are enumerated by
      #   PDFIssues, otherwise 'page_image'
      def ingest_type
        return 'issue_pdf' if @issues.class == NewspaperWorks::Ingest::PDFIssues
        'page_image'
      end

      # Ingest every issue enumerated for the publication, creating an
      # issue work for each, then ingesting its PDF or its page images.
      def ingest
        write_log("Beginning issue(s) batch ingest for #{@path}")
        write_log("\tPublication: #{@publication.title} (LCCN: #{@lccn})")
        @issues.each do |path, issue_data|
          issue = create_issue(issue_data)
          tactic = ingest_type
          ingest_pdf(issue, path) if tactic == 'issue_pdf'
          ingest_pages(issue, issue_data) if tactic == 'page_image'
        end
        write_log(
          "Issue ingest completed for LCCN #{@lccn}. Asyncrhonous jobs "\
          "may still be creating derivatives for issue, and child page works."
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
require 'faraday'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
|
|
4
|
+
module NewspaperWorks
  module Ingest
    # Publication info from ChronAm as remote authority for metadata
    class ChronAmPublicationInfo < BasePublicationInfo
      attr_accessor :issn, :title, :place_name, :place_of_publication, :language

      # XML namespace prefixes used in XPath queries against the RDF
      XML_NS = {
        dcterms: 'http://purl.org/dc/terms/',
        frbr: 'http://purl.org/vocab/frbr/core#',
        owl: 'http://www.w3.org/2002/07/owl#',
        rda: 'http://rdvocab.info/elements/',
        rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        rdfs: 'http://www.w3.org/2000/01/rdf-schema#'
      }.freeze

      BASE_URL = 'https://chroniclingamerica.loc.gov/lccn'.freeze

      # @param lccn [String] LCCN of publication to look up
      def initialize(lccn)
        # true until loaded
        @empty = true
        super(lccn)
        @issn = nil # chronam doesn't have this
      end

      # @return [Boolean] true unless `load` found a record for the LCCN
      def empty?
        @empty
      end

      # @return [String] short representation showing class and LCCN
      def inspect
        format(
          "<#{self.class}:0x000000000%<oid>x " \
          "\tlccn: '#{@lccn}'>",
          oid: object_id << 1
        )
      end

      # Set place name and place-of-publication URI from the first
      # rda:placeOfPublication element; does nothing when no document is
      # loaded or the document has no such element.
      def load_place
        place_node = find('//rda:placeOfPublication')&.first
        return if place_node.nil?
        @place_name = place_node.text
        @place_of_publication = NewspaperWorks::Ingest.geonames_place_uri(
          @place_name
        )
      end

      # @return [String] URL of the RDF description for the LCCN
      def url
        "#{BASE_URL}/#{@lccn}.rdf"
      end

      # Fetch and parse the RDF for the LCCN, setting title, language and
      # place; on a 404 response, returns early and the object stays empty.
      def load
        resp = NewspaperWorks::ResourceFetcher.get url
        return if resp['status'] == 404
        @doc = Nokogiri.XML(resp['body'])
        @title = normalize_title(find('//dcterms:title').first.text)
        @language = iso_language_for(find('//dcterms:language').first.text)
        @empty = false
        load_place
      end

      # @return [String, nil] prefixed OCLC number taken from the first
      #   owl:sameAs resource starting with 'info:oclcnum', or nil if none
      def oclcnum
        key = 'info:oclcnum'
        selected = sameas_resources.select { |v| v.text.start_with?(key) }
        return if selected.empty?
        oclc_prefixed(selected[0].text.split('/')[1])
      end

      # @return [String, nil] URL for the publication this one succeeds,
      #   or nil if empty or none is listed
      def preceded_by
        return if empty?
        found = find('//frbr:successorOf/@rdf:resource').first
        return if found.nil?
        normalize_related(found.text)
      end

      # @return [String, nil] URL for the successor publication, or nil
      #   if empty or none is listed
      def succeeded_by
        return if empty?
        found = find('//frbr:successor/@rdf:resource').first
        return if found.nil?
        normalize_related(found.text)
      end

      private

        def normalize_title(value)
          NewspaperWorks::Ingest.normalize_title(value)
        end

        # Returns URL to LC catalog, provided such exists, on the basis of
        # non-empty MODS for given LCCN. Otherwise returns nil, including
        # when the response body has no XML root element.
        def lc_catalog_url(lccn)
          content_url = "https://lccn.loc.gov/#{lccn}"
          url = "#{content_url}/mods"
          resp = NewspaperWorks::ResourceFetcher.get url
          root = Nokogiri.XML(resp['body']).root
          return if root.nil? || root.children.empty?
          content_url
        end

        # @param value [String] resource URL of a related publication;
        #   the LCCN is its last path segment, minus any fragment
        # @return [String] preferred URL for the related publication
        def normalize_related(value)
          lccn = value.split('/')[-1].split('#')[0]
          lc_url = lc_catalog_url(lccn)
          # URL to lccn.loc.gov is preferred authority for publication URL
          return lc_url unless lc_url.nil?
          # URL to HTML representation of content on ChronAm is fallback
          "#{BASE_URL}/#{lccn}"
        end

        # @return owl:sameAs resource attributes, or [] when no document
        #   is loaded
        def sameas_resources
          find('//owl:sameAs/@rdf:resource') || []
        end

        # Run an XPath query, with XML_NS prefixes, against the given
        # context or the loaded document.
        # @return query result, or nil when there is no context to search
        def find(expr, context = nil)
          context ||= @doc
          return if context.nil?
          context.xpath(expr, **XML_NS)
        end

        # ISO 639-2 three-character code from ISO 639-1 two-character code
        # or equivalent lingvoj resource URL used by ChronAm;
        # uses HTML language tables maintained by LOC.
        def iso_language_for(code)
          # handle case where source language code is lingvoj url:
          code = code.split('/')[-1]
          lookup_url = 'https://www.loc.gov/standards/iso639-2/php/langcodes_name.php'
          lookup_url += "?iso_639_1=#{code}"
          resp = NewspaperWorks::ResourceFetcher.get lookup_url
          html = Nokogiri::HTML(resp['body'])
          html.xpath('//table[1]/tr[2]/td[2]').first.text.strip
        end
    end
  end
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module NewspaperWorks
  module Ingest
    # Class-method mixin for invoking ingests from the command line.
    # Usage in classes: `extend NewspaperWorks::Ingest::FromCommand`;
    # every method here is expected to be called as a class method.
    module FromCommand
      # Alternate constructor from ARGV-style arguments: parses the batch
      # path and options, reports a missing or nonexistent path, adds the
      # path's parent directory to Hyrax's whitelisted ingest dirs, and
      # instantiates the extending class.
      # @param options [Array<String>] command-line arguments
      # @param cmd_name [String] command name, used in usage messages
      def from_command(options, cmd_name)
        batch_dir, ingest_opts = batch_path(options, cmd_name)
        missing_path(cmd_name) if batch_dir.nil?
        batch_dir = normalize_path(batch_dir)
        unless File.exist?(batch_dir)
          missing_path(cmd_name, "Not found: #{batch_dir}")
        end
        Hyrax.config.whitelisted_ingest_dirs.push(File.dirname(batch_dir))
        new(batch_dir, ingest_opts)
      end

      # Print usage and a message to standard error; exits with status 1
      # only when the command name starts with 'rake'.
      # @param cmd_name [String]
      # @param msg [String] explanation, printed before ". Exiting."
      def missing_path(cmd_name, msg = "Missing path argument")
        STDERR.puts "Usage: #{cmd_name} -- --path=PATH", "#{msg}. Exiting."
        # rubocop:disable Rails/Exit
        exit(1) if cmd_name.start_with?('rake')
        # rubocop:enable Rails/Exit
      end

      # Parse the batch path and remaining ingest options from arguments.
      # @param options [Array<String>] command-line arguments (modified in
      #   place by the option parser)
      # @param cmd_name [String] command name, used in the usage banner
      # @return [Array] two items: the path (nil if not given) and a Hash
      #   of the parsed options
      def batch_path(options, cmd_name)
        batch_dir = nil
        parsed = {}
        parser = OptionParser.new
        # first pass, before any switch is defined; leading non-option
        # words are handed to the empty block and thereby dropped
        remaining = parser.order!(options) {}
        parser.banner = "Usage: #{cmd_name} -- --path=PATH"
        parser.on('-i PATH', '--path PATH') { |value| batch_dir = value }
        # lccn used by PDF issue ingest, but not NDNP ingest
        %w[admin_set depositor visibility lccn].each do |name|
          parser.on("--#{name}=#{name.upcase}")
        end
        parser.parse!(remaining, into: parsed)
        [batch_dir, parsed]
      end

      # Default normalization is no normalization of path.
      # @param path [String]
      # @return [String]
      def normalize_path(path)
        path
      end
    end
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
module NewspaperWorks
  module Ingest
    # Enumerates the issue directories of page images found directly
    # under one publication directory.
    class ImageIngestIssues
      include Enumerable
      include NewspaperWorks::Ingest::PathEnumeration

      attr_accessor :path, :publication

      delegate :lccn, to: :publication

      # @param path [String] path to publication directory containing issues
      # @param publication publication info object (responds to `lccn`)
      def initialize(path, publication)
        # path is path to publication directory containing issues:
        @path = path
        # Publication info
        @publication = publication
        @issue_paths = nil
      end

      # Paths of issue directories: non-hidden subdirectories whose name
      # begins with a YYYYMMDD date (optionally followed by two digits).
      # Computed once, then memoized; sorted by name so that enumeration
      # order does not depend on filesystem listing order.
      # @return [Array<String>]
      def paths
        return @issue_paths unless @issue_paths.nil?
        names = Dir.entries(path).reject { |n| n.start_with?('.') }.sort
        candidates = names.map { |n| File.join(path, n) }
        @issue_paths = candidates.select do |p|
          File.directory?(p) && path_validates?(p)
        end
      end

      # @param path [String] path to an issue directory
      # @return [NewspaperWorks::Ingest::IssueImages] issue at that path
      def info(path)
        NewspaperWorks::Ingest::IssueImages.new(path, @publication)
      end

      private

        # @return [Boolean] whether the directory name starts with a
        #   YYYYMMDD date, optionally followed by two more digits
        def path_validates?(p)
          ptn = /^([0-9]{4})(1[012]|[0][1-9])(3[01]|[12][0-9]|0[1-9])([0-9]{2})?/
          ptn.match?(File.basename(p))
        end
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
require 'date'
|
|
2
|
+
require 'find'
|
|
3
|
+
|
|
4
|
+
module NewspaperWorks
  module Ingest
    # Represents TIFF/JP2 issue, provides metadata, enumerates PageImage objects
    class IssueImages
      # most accessors for issue/edition metadata, publication metadata
      # provided by including this mixin:
      include NewspaperWorks::Ingest::NamedIssueMetadata

      # Path enumeration by mixing in Enumerable, PathEnumeration
      include Enumerable
      include NewspaperWorks::Ingest::PathEnumeration

      attr_accessor :path, :publication

      # things that look like images, by file extension:
      IMAGE_EXT = ['tiff', 'tif', 'jp2', 'jpg', 'png'].freeze

      # @param path [String] path to issue directory of page images
      # @param publication publication info object
      # @raise [ArgumentError] if path is not a directory
      def initialize(path, publication)
        @path = path
        raise ArgumentError, 'Path not directory' unless File.directory?(path)
        validate_path
        # as a NewspaperWorks::Ingest::PublicationInfo object:
        @publication = publication
        @pages = nil
      end

      # Sorted paths of regular files directly inside the issue directory
      # whose lower-cased extension is listed in IMAGE_EXT. Computed once,
      # then memoized; the memo is only set after a complete listing.
      # @return [Array<String>]
      def page_paths
        return @pages unless @pages.nil?
        candidates = Dir.entries(path).map { |n| File.join(path, n) }.sort
        @pages = candidates.select do |p|
          File.ftype(p) == 'file' &&
            IMAGE_EXT.include?(File.basename(p).downcase.split('.')[-1])
        end
      end

      # @param path [String] path to a page image, as listed by page_paths
      # @return [NewspaperWorks::Ingest::PageImage] page image, numbered by
      #   its one-based position within page_paths
      # @raise [ArgumentError] if path is not one of this issue's pages
      def info(path)
        index = page_paths.index(path)
        if index.nil?
          raise ArgumentError, "Not a page image of this issue: #{path}"
        end
        NewspaperWorks::Ingest::PageImage.new(path, self, index + 1)
      end

      alias paths page_paths
    end
  end
end
|