newspaper_works 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.fcrepo_wrapper +4 -0
- data/.gitignore +43 -0
- data/.rubocop.yml +143 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +50 -0
- data/Gemfile +47 -0
- data/LICENSE +203 -0
- data/README.md +159 -0
- data/Rakefile +38 -0
- data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
- data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
- data/app/assets/config/newspaper_works_manifest.js +2 -0
- data/app/assets/images/newspaper_works/.keep +0 -0
- data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
- data/app/assets/javascripts/newspaper_works.js +4 -0
- data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
- data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
- data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
- data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
- data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
- data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
- data/app/forms/hyrax/newspaper_article_form.rb +11 -0
- data/app/forms/hyrax/newspaper_container_form.rb +11 -0
- data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
- data/app/forms/hyrax/newspaper_page_form.rb +15 -0
- data/app/forms/hyrax/newspaper_title_form.rb +12 -0
- data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
- data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
- data/app/helpers/newspaper_works/application_helper.rb +5 -0
- data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
- data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
- data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
- data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
- data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
- data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
- data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
- data/app/indexers/newspaper_article_indexer.rb +16 -0
- data/app/indexers/newspaper_container_indexer.rb +18 -0
- data/app/indexers/newspaper_issue_indexer.rb +26 -0
- data/app/indexers/newspaper_page_indexer.rb +9 -0
- data/app/indexers/newspaper_title_indexer.rb +19 -0
- data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
- data/app/jobs/newspaper_works/application_job.rb +4 -0
- data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
- data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
- data/app/mailers/newspaper_works/application_mailer.rb +8 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
- data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
- data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
- data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
- data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
- data/app/models/file_set.rb +10 -0
- data/app/models/newspaper_article.rb +158 -0
- data/app/models/newspaper_container.rb +86 -0
- data/app/models/newspaper_issue.rb +115 -0
- data/app/models/newspaper_page.rb +70 -0
- data/app/models/newspaper_title.rb +111 -0
- data/app/models/newspaper_works/application_record.rb +6 -0
- data/app/models/newspaper_works/derivative_attachment.rb +8 -0
- data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
- data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
- data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
- data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
- data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
- data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
- data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
- data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
- data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
- data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
- data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
- data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
- data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
- data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
- data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
- data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
- data/app/services/hyrax/article_genre_service.rb +9 -0
- data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
- data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
- data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
- data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
- data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
- data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
- data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
- data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
- data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
- data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
- data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
- data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
- data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
- data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
- data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
- data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
- data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
- data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
- data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
- data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
- data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
- data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
- data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
- data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
- data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
- data/app/views/newspaper_works/base/_show.html.erb +45 -0
- data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
- data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
- data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
- data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
- data/app/views/records/edit_fields/_genre.html.erb +4 -0
- data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
- data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/newspaper_article.de.yml +12 -0
- data/config/locales/newspaper_article.en.yml +12 -0
- data/config/locales/newspaper_article.es.yml +12 -0
- data/config/locales/newspaper_article.fr.yml +12 -0
- data/config/locales/newspaper_article.it.yml +12 -0
- data/config/locales/newspaper_article.pt-BR.yml +12 -0
- data/config/locales/newspaper_article.zh.yml +12 -0
- data/config/locales/newspaper_container.de.yml +8 -0
- data/config/locales/newspaper_container.en.yml +8 -0
- data/config/locales/newspaper_container.es.yml +8 -0
- data/config/locales/newspaper_container.fr.yml +8 -0
- data/config/locales/newspaper_container.it.yml +8 -0
- data/config/locales/newspaper_container.pt-BR.yml +8 -0
- data/config/locales/newspaper_container.zh.yml +8 -0
- data/config/locales/newspaper_issue.de.yml +8 -0
- data/config/locales/newspaper_issue.en.yml +8 -0
- data/config/locales/newspaper_issue.es.yml +8 -0
- data/config/locales/newspaper_issue.fr.yml +8 -0
- data/config/locales/newspaper_issue.it.yml +8 -0
- data/config/locales/newspaper_issue.pt-BR.yml +8 -0
- data/config/locales/newspaper_issue.zh.yml +8 -0
- data/config/locales/newspaper_page.de.yml +15 -0
- data/config/locales/newspaper_page.en.yml +15 -0
- data/config/locales/newspaper_page.es.yml +15 -0
- data/config/locales/newspaper_page.fr.yml +15 -0
- data/config/locales/newspaper_page.it.yml +15 -0
- data/config/locales/newspaper_page.pt-BR.yml +15 -0
- data/config/locales/newspaper_page.zh.yml +15 -0
- data/config/locales/newspaper_title.de.yml +8 -0
- data/config/locales/newspaper_title.en.yml +8 -0
- data/config/locales/newspaper_title.es.yml +8 -0
- data/config/locales/newspaper_title.fr.yml +8 -0
- data/config/locales/newspaper_title.it.yml +8 -0
- data/config/locales/newspaper_title.pt-BR.yml +8 -0
- data/config/locales/newspaper_title.zh.yml +8 -0
- data/config/locales/newspaper_works.de.yml +50 -0
- data/config/locales/newspaper_works.en.yml +52 -0
- data/config/locales/newspaper_works.es.yml +52 -0
- data/config/locales/newspaper_works.fr.yml +52 -0
- data/config/locales/newspaper_works.it.yml +52 -0
- data/config/locales/newspaper_works.pt-BR.yml +52 -0
- data/config/locales/newspaper_works.zh.yml +52 -0
- data/config/routes.rb +9 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
- data/lib/generators/newspaper_works/assets_generator.rb +29 -0
- data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
- data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
- data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
- data/lib/generators/newspaper_works/install_generator.rb +97 -0
- data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
- data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
- data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
- data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
- data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
- data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
- data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
- data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
- data/lib/newspaper_works/configuration.rb +14 -0
- data/lib/newspaper_works/data/fileset_helper.rb +25 -0
- data/lib/newspaper_works/data/path_helper.rb +40 -0
- data/lib/newspaper_works/data/work_derivatives.rb +314 -0
- data/lib/newspaper_works/data/work_file.rb +92 -0
- data/lib/newspaper_works/data/work_files.rb +181 -0
- data/lib/newspaper_works/data.rb +35 -0
- data/lib/newspaper_works/engine.rb +42 -0
- data/lib/newspaper_works/errors.rb +14 -0
- data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
- data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
- data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
- data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
- data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
- data/lib/newspaper_works/ingest/from_command.rb +52 -0
- data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
- data/lib/newspaper_works/ingest/issue_images.rb +51 -0
- data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
- data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
- data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
- data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
- data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
- data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
- data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
- data/lib/newspaper_works/ingest/ndnp.rb +21 -0
- data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
- data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
- data/lib/newspaper_works/ingest/page_image.rb +52 -0
- data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
- data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
- data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
- data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
- data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
- data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
- data/lib/newspaper_works/ingest/publication_info.rb +44 -0
- data/lib/newspaper_works/ingest.rb +90 -0
- data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
- data/lib/newspaper_works/logging.rb +54 -0
- data/lib/newspaper_works/page_finder.rb +62 -0
- data/lib/newspaper_works/resource_fetcher.rb +78 -0
- data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
- data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
- data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
- data/lib/newspaper_works/text_extraction.rb +10 -0
- data/lib/newspaper_works/version.rb +3 -0
- data/lib/newspaper_works.rb +19 -0
- data/lib/tasks/newspaper_works_tasks.rake +39 -0
- data/newspaper_works.gemspec +49 -0
- data/spec/.keep.txt +1 -0
- data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
- data/spec/controllers/catalog_controller_spec.rb +63 -0
- data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
- data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_issue_ingest.rb +6 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_ingest.rb +6 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/features/front_pages_for_title_spec.rb +19 -0
- data/spec/features/newspaper_title_search_spec.rb +30 -0
- data/spec/features/newspapers_search_spec.rb +49 -0
- data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
- data/spec/features_shared.rb +71 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
- data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
- data/spec/fixtures/files/resource_mocks/urls.json +82 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
- data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
- data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
- data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
- data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
- data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
- data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
- data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
- data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
- data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
- data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
- data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
- data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
- data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
- data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
- data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
- data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
- data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
- data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
- data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
- data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
- data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
- data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
- data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
- data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
- data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
- data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
- data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
- data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
- data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
- data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
- data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
- data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
- data/spec/lib/newspaper_works/logging_spec.rb +53 -0
- data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
- data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
- data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
- data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
- data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
- data/spec/misc_shared.rb +109 -0
- data/spec/model_shared.rb +134 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
- data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
- data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
- data/spec/models/newspaper_article_spec.rb +73 -0
- data/spec/models/newspaper_container_spec.rb +111 -0
- data/spec/models/newspaper_issue_spec.rb +91 -0
- data/spec/models/newspaper_page_spec.rb +44 -0
- data/spec/models/newspaper_title_spec.rb +116 -0
- data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
- data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/ndnp_shared.rb +48 -0
- data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
- data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
- data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
- data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
- data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
- data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
- data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
- data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
- data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
- data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
- data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
- data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
- data/spec/routing/route_spec.rb +52 -0
- data/spec/search_builders/custom_search_builder_spec.rb +34 -0
- data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
- data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
- data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
- data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
- data/spec/spec_helper.rb +261 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
- data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
- data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
- data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
- data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
- data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
- data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
- data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
- data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
- data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
- data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
- data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
- data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
- data/tasks/newspaperworks_dev.rake +26 -0
- data/test/integration/navigation_test.rb +7 -0
- data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
- data/test/newspaper_works_test.rb +7 -0
- data/test/test_helper.rb +17 -0
- data/tmp/.keep +0 -0
- metadata +1037 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
module NewspaperWorks
|
2
|
+
module Ingest
|
3
|
+
# Represents TIFF/JP2 page, access to file, page-numbering metadata
|
4
|
+
class PageImage
|
5
|
+
attr_accessor :path, :issue, :sequence
|
6
|
+
|
7
|
+
delegate :lccn, to: :issue
|
8
|
+
|
9
|
+
# Builds a page image record for one image file belonging to an issue.
#
# @param path [String] path to the page image file
# @param issue [NewspaperWorks::Ingest::IssueImages] issue the page is in
# @param sequence [#to_i] page sequence number, stored as an Integer
# @raise [ArgumentError] when validate_path rejects the given path
def initialize(path, issue, sequence)
  # the path has to be stored first: validation reads it back
  @path = path
  validate_path
  @issue, @sequence = issue, sequence.to_i
end
|
18
|
+
|
19
|
+
# Page number inferred from image filename, or nil, presuming that:
|
20
|
+
# - The page number follows the actual word "page" (case-insensitive)
|
21
|
+
# in filename, possibly separated by a dash or underscore.
|
22
|
+
# - The page number is terminated by the period-plus-file-extension.
|
23
|
+
# - Both of the above can be determined by regular expression match.
|
24
|
+
# - Extraneous leading information in filename (e.g. datestamp) will
|
25
|
+
# be ignored.
|
26
|
+
# - Examples:
|
27
|
+
# - 'Page1.tiff'
|
28
|
+
# - '2019091801-page_1.jp2'
|
29
|
+
# - 'page_C2.tiff'
|
30
|
+
# @return [String, NilClass] page number string, or nil if indecipherable
|
31
|
+
def named_page_number
  # Match against the file's base name only: a directory component that
  # happens to contain "page" (e.g. "/scans/pages/0001.tiff") must not be
  # mistaken for a page label.
  # Capture group 3 is the text between "page" (plus an optional dash or
  # underscore) and the next period.
  match = /(page)([_-]?)([^.]+)([.])/i.match(File.basename(path))
  match && match[3]
end
|
36
|
+
|
37
|
+
# Page number for display: the number found in the file name when there is
# one, otherwise the page's sequence number.
#
# @return [String]
def page_number
  from_name = named_page_number
  return from_name if from_name
  @sequence.to_s
end
|
40
|
+
|
41
|
+
# Title for the page, built from the issue's first title value and this
# page's number.
#
# @return [Array<String>] one-element array holding the title
def title
  issue_title = @issue.title.first
  ["#{issue_title}: Page #{page_number}"]
end
|
44
|
+
|
45
|
+
# Checks that the stored path names an existing regular file.
#
# @return [nil] when the path is acceptable
# @raise [ArgumentError] when nothing exists at the path, or when what
#   exists there is not reported by File.ftype as a regular file
def validate_path
  raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
  # File.ftype is kept (rather than File.file?) so the accepted set of
  # paths is exactly what it was before; only the messages are new.
  return if File.ftype(path) == 'file'
  raise ArgumentError, "Not a regular file: #{path}"
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module NewspaperWorks
|
2
|
+
module Ingest
|
3
|
+
# Provides enumeration of path keys to object values, where:
|
4
|
+
# - Consuming class:
|
5
|
+
# - Defines a `paths` method returning array of paths.
|
6
|
+
# - Defines an `info` method that returns an object for a path.
|
7
|
+
# - Also mixes in Enumerable
|
8
|
+
module PathEnumeration
|
9
|
+
delegate :size, :include?, to: :_paths
|
10
|
+
|
11
|
+
# Forwards to the consuming class's `paths`; this module's delegation and
# `keys` alias are declared against this name.
def _paths
  paths
end
|
14
|
+
|
15
|
+
# Forwards to the consuming class's `info` for the given path; this
# module's `[]` alias is declared against this name.
#
# @param path [Object] a path key, passed through unchanged
# @return [Object] whatever `info` returns for that path
def _info(path)
  info(path)
end
|
18
|
+
|
19
|
+
# Iterates over [path, info] pairs, one per entry in `paths`.
#
# @yieldparam pair [Array] two-element array of path and info(path)
# @return [Enumerator] when called without a block
def each
  if block_given?
    paths.each { |key| yield [key, info(key)] }
  else
    enum_for(:each)
  end
end
|
25
|
+
|
26
|
+
def each_key
|
27
|
+
enum_for(:each_key) unless block_given?
|
28
|
+
paths.each { |path| yield path }
|
29
|
+
end
|
30
|
+
|
31
|
+
def each_value
|
32
|
+
return enum_for(:each_value) unless block_given?
|
33
|
+
paths.each do |path|
|
34
|
+
yield info(path)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def values
|
39
|
+
each_value.to_a
|
40
|
+
end
|
41
|
+
|
42
|
+
def entries
|
43
|
+
each.to_a
|
44
|
+
end
|
45
|
+
|
46
|
+
alias each_pair each
|
47
|
+
alias keys _paths
|
48
|
+
alias has_key? include?
|
49
|
+
alias [] _info
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'mini_magick'
|
3
|
+
|
4
|
+
module NewspaperWorks
  module Ingest
    # PdfImages uses poppler 0.19+ pdfimages command to extract image
    #   listing metadata from PDF files.
    #   For dpi extraction, falls back to calculating using MiniMagick,
    #   if necessary.
    class PdfImages
      # class constant column numbers (zero-based, into each listing row)
      COL_WIDTH = 3
      COL_HEIGHT = 4
      COL_COLOR = 5
      COL_CHANNELS = 6
      COL_BITS = 7
      # only poppler 0.25+ has this column in output:
      COL_XPPI = 12

      # @param path [String] path to PDF file
      def initialize(path)
        @path = path
        # argv form, so that the path is passed as a single argument and is
        # never re-split or interpreted by a shell (spaces, quotes, etc.):
        @cmd = ['pdfimages', '-list', path.to_s]
        @output = nil
        @entries = nil
      end

      # Runs `pdfimages -list` (only once; output is cached) and returns
      # the listing rows with the two header lines removed.
      # @return [Array<String>] data rows; empty when there is no listing
      def process
        # call just once
        if @output.nil?
          # capture3 drains both stdout and stderr, so the child cannot
          # block on an unread stderr pipe.
          stdout, _stderr, _status = Open3.capture3(*@cmd)
          @output = stdout.split("\n")
        end
        # drop(2) yields [] for short/empty output, where slice gave nil:
        @output.drop(2)
      end

      # @return [Array<Array<String>>] whitespace-separated columns per row
      def entries
        @entries ||= process.map(&:split)
      end

      # @param i [Integer] zero-based column number
      # @yield [String] optional transform applied to each column value
      # @return [Array] column values, one per listing row
      def selectcolumn(i, &block)
        column = entries.map { |e| e[i] }
        block ? column.map(&block) : column
      end

      # @return [Integer, NilClass] largest image width in listing, or nil
      #   when the listing has no rows
      def width
        selectcolumn(COL_WIDTH, &:to_i).max
      end

      # @return [Integer, NilClass] largest image height in listing, or nil
      #   when the listing has no rows
      def height
        selectcolumn(COL_HEIGHT, &:to_i).max
      end

      # @return [Array] [description, channels, bits-per-channel]
      def color
        # desc is either 'gray', 'cmyk', 'rgb', but 1-bit gray is black/white
        #   so caller may want all of this information, and in case of
        #   mixed color spaces across images, this returns maximum
        desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
        channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
        bits = entries.map { |e| e[COL_BITS].to_i }.max
        [desc, channels, bits]
      end

      # @return [Integer, NilClass] pixels-per-inch, or nil when the listing
      #   has no rows to measure
      def ppi
        return nil if entries.empty?
        if entries.first.size <= COL_XPPI
          # poppler < 0.25: no x-ppi column, so derive ppi from the width
          # MiniMagick reports (treated as points, 72 per inch):
          pdf = MiniMagick::Image.open(@path)
          width_points = pdf.width
          width_px = width
          return (72 * width_px / width_points).to_i
        end
        # with poppler 0.25+, pdfimages just gives us this:
        selectcolumn(COL_XPPI, &:to_i).max
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'date'
|
2
|
+
|
3
|
+
module NewspaperWorks
  module Ingest
    # Ingest source representing one issue held in a single PDF file,
    # paired with the metadata of the publication it belongs to.
    class PDFIssue
      attr_accessor :path
      attr_accessor :publication

      # Supplies most accessors for issue/edition metadata and publication
      # metadata. NOTE(review): `validate_path` is not defined in this
      # class; presumably it comes from this mixin -- confirm.
      include NewspaperWorks::Ingest::NamedIssueMetadata

      # @param path [String] path to the issue PDF
      # @param publication [NewspaperWorks::Ingest::PublicationInfo]
      def initialize(path, publication)
        self.path = path
        validate_path
        # as a NewspaperWorks::Ingest::PublicationInfo object:
        self.publication = publication
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'find'
|
2
|
+
|
3
|
+
module NewspaperWorks
  module Ingest
    # Enumerates the PDF files found beneath a directory, mapping each
    # PDF path to a PDFIssue for the given publication.
    class PDFIssues
      include Enumerable
      include NewspaperWorks::Ingest::PathEnumeration

      attr_accessor :path, :publication, :pdf_paths

      # PathEnumeration expects the consuming class to provide `paths`:
      alias paths pdf_paths

      # @param path [String] directory searched for PDF files
      # @param publication [NewspaperWorks::Ingest::PublicationInfo]
      def initialize(path, publication)
        @path = path
        # as a NewspaperWorks::Ingest::PublicationInfo object:
        @publication = publication
        @pdf_paths = valid_pdfs(path)
      end

      # @param path [String] directory to walk
      # @return [Array<String>] paths of non-directory entries found under
      #   path whose names end with '.pdf'
      def valid_pdfs(path)
        Find.find(path).select do |candidate|
          !File.directory?(candidate) && candidate.end_with?('.pdf')
        end
      end

      # @return LCCN as reported by the publication object
      def lccn
        @publication.lccn
      end

      # @param path [String] path to one issue PDF
      # @return [NewspaperWorks::Ingest::PDFIssue] issue object for path
      def info(path)
        NewspaperWorks::Ingest::PDFIssue.new(path, @publication)
      end
    end
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'securerandom'
|
3
|
+
require 'tmpdir'
|
4
|
+
|
5
|
+
module NewspaperWorks
  module Ingest
    # Enumerates per-page TIFF images rendered from a PDF by Ghostscript.
    # Rendering happens on first access to `entries`; output files are
    # written to a temporary directory created by this object.
    class PdfPages
      include Enumerable

      # @param path [String] path to source PDF
      def initialize(path)
        @baseid = SecureRandom.uuid
        @pdfpath = path
        @info = nil
        @entries = nil
        @tmpdir = nil
        @size = nil
        @pagecount = nil
        @pdftext = nil
      end

      # @return [PdfImages] cached image-listing metadata for the PDF
      def pdfinfo
        @info ||= PdfImages.new(@pdfpath)
      end

      # @return [String] temporary output directory, created on first use
      def tmpdir
        @tmpdir ||= Dir.mktmpdir
      end

      # @param channels [Integer] number of color channels
      # @param bpc [Integer] bits per channel
      # @return [String] Ghostscript color TIFF device name
      def colordevice(channels, bpc)
        bits = bpc * channels
        # will be either 8bpc/16bpc color TIFF,
        #   with any CMYK source transformed to 8bpc RGB
        bits = 24 unless [24, 48].include?(bits)
        "tiff#{bits}nc"
      end

      # @return [String] Ghostscript output device appropriate to the
      #   color information reported by `pdfinfo.color`
      def gsdevice
        color, channels, bpc = pdfinfo.color
        if color == 'gray'
          # CCITT Group 4 Black and White, if applicable:
          return 'tiffg4' if bpc == 1
          # 8 Bit Grayscale, if applicable:
          return 'tiffgray' if bpc > 1
        end
        # otherwise color:
        colordevice(channels, bpc)
      end

      # Runs Ghostscript's txtwrite device over the PDF.
      # @return [String] stdout of the Ghostscript text extraction
      def gstext
        # argv form keeps paths with spaces/metacharacters intact, and
        # capture3 drains stderr so the child cannot block on a full pipe.
        stdout, = Open3.capture3(
          'gs', '-q', '-dNOPAUSE', '-dBATCH', '-sDEVICE=txtwrite',
          '-sOutputFile=-', '-f', @pdfpath.to_s
        )
        @pdftext = stdout
      end

      # @return [Integer] page count from the 'Pages:' line of poppler
      #   `pdfinfo` output; 0 when no such line is present
      def pagecount
        stdout, = Open3.capture3('pdfinfo', @pdfpath.to_s)
        pages_line = stdout.lines.find { |line| line.start_with?('Pages:') }
        @pagecount = pages_line.to_s.split.last.to_i
      end

      # @return [Boolean] true if PDF has exactly one image per page and
      #   the largest image dimensions exceed ~10 megapixels
      def looks_scanned
        images = pdfinfo.entries
        # No embedded images at all: width/height would be nil, and such a
        # PDF cannot be scanned media.
        return false if images.empty?
        max_image_px = pdfinfo.width * pdfinfo.height
        single_image_per_page = images.length == pagecount
        # single 10mp+ image per page?
        single_image_per_page && max_image_px > 1024 * 1024 * 10
      end

      # @return [Integer] resolution at which to render pages
      def ppi
        # 400 dpi for something that does not look like scanned media:
        return 400 unless looks_scanned
        # For scanned media, defer to detected image PPI:
        pdfinfo.ppi
      end

      # ghostscript convert all pages to TIFF
      # @return [Array<String>] expected output paths, one per 'Page ' line
      #   printed by Ghostscript
      def gsconvert
        output_base = File.join(tmpdir, "#{@baseid}-page%d.tiff")
        stdout, = Open3.capture3(
          'gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{gsdevice}",
          '-dTextAlphaBits=4',
          "-sOutputFile=#{output_base}", "-r#{ppi}", '-f', @pdfpath.to_s
        )
        @size = stdout.lines.count { |line| line.start_with?('Page ') }
        # Return an array of expected filenames
        (1..@size).map { |n| File.join(tmpdir, "#{@baseid}-page#{n}.tiff") }
      end

      # entries for each page
      # @return [Array<String>] cached result of gsconvert
      def entries
        @entries ||= gsconvert
      end

      # Yields each page image path; returns an Enumerator without a block.
      def each(&block)
        return enum_for(:each) unless block_given?
        entries.each(&block)
      end
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'newspaper_works/logging'
|
2
|
+
require 'newspaper_works/ingest'
|
3
|
+
|
4
|
+
module NewspaperWorks
  module Ingest
    # mixin for find-or-create of publication, for use by various ingests
    module PubFinder
      include NewspaperWorks::Logging

      # Fields copied from looked-up metadata onto a publication:
      COPY_FIELDS = %i[
        title
        lccn
        oclcnum
        issn
        place_of_publication
        language
        preceded_by
        succeeded_by
      ].freeze

      # Fields whose value is assigned wrapped in an array:
      MULTI_VALUED = %i[
        title
        language
        preceded_by
        succeeded_by
        place_of_publication
      ].freeze

      # Fields whose raw value is wrapped in an object before assignment:
      WRAPPERS = {
        place_of_publication: Hyrax::ControlledVocabularies::Location
      }.freeze

      # @param lccn [String] Library of Congress Control Number
      #   of Publication
      # @return [NewspaperTitle, NilClass] publication or nil if not found
      def find_publication(lccn)
        NewspaperTitle.where(lccn: lccn).first
      end

      # Copy publication metadata from authority lookup for LCCN
      # @param publication [NewspaperTitle]
      # @param metadata [NewspaperWorks::Ingest::PublicationInfo]
      # @param lccn [String] used as title when no other title is available
      # @param title [String, NilClass] locally-specified title, if any
      def copy_publication_metadata(publication, metadata, lccn, title = nil)
        COPY_FIELDS.each do |field|
          raw = metadata.send(field)
          next if raw.nil?
          # wrapped value, if applicable:
          wrapper = WRAPPERS[field]
          prepared = wrapper ? wrapper.new(raw) : raw
          # value in array, if applicable:
          prepared = [prepared] if MULTI_VALUED.include?(field)
          publication.send("#{field}=", prepared)
        end
        # prefer locally-specified title to looked-up title:
        publication.title = [title] unless title.nil?
        # final fallback, nothing specified, title mandatory: use LCCN
        publication.title = [lccn] if publication.title.empty?
      end

      # Creates, populates, saves and logs a new NewspaperTitle.
      # @param lccn [String] Library of Congress Control Number
      # @param title [String, NilClass] locally-specified title, if any
      # @param opts [Hash] passed to assign_administrative_metadata
      # @return [NewspaperTitle] the saved publication
      def create_publication(lccn, title = nil, opts = {})
        NewspaperTitle.create.tap do |publication|
          info = NewspaperWorks::Ingest::PublicationInfo.new(lccn)
          copy_publication_metadata(publication, info, lccn, title)
          publication.lccn ||= lccn
          NewspaperWorks::Ingest.assign_administrative_metadata(publication, opts)
          publication.save!
          write_log(
            "Created NewspaperTitle work #{publication.id} for LCCN #{lccn}"
          )
        end
      end

      # Finds the publication for lccn (creating it when absent), then adds
      # the issue to its members, saves, and logs the link.
      def find_or_create_publication_for_issue(issue, lccn, title, opts)
        publication = find_publication(lccn)
        if publication.nil?
          publication = create_publication(lccn, title, opts)
        else
          write_log(
            "Found existing NewspaperTitle #{publication.id}, LCCN #{lccn}"
          )
        end
        publication.members << issue
        publication.save!
        write_log(
          "Linked NewspaperIssue #{issue.id} to "\
          "NewspaperTitle work #{publication.id}"
        )
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'faraday'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
module NewspaperWorks
  module Ingest
    # Proxy for publication metadata: holds an underlying implementation
    # object and forwards to it any message this class does not define.
    class PublicationInfo
      attr_accessor :implementation, :lccn

      # @param lccn [String] identifier handed to the implementation
      def initialize(lccn)
        @lccn = lccn
        @implementation = nil
        load
      end

      # Replace the implementation with a ChronAmPublicationInfo.
      def load_chronam_fallback
        @implementation = ChronAmPublicationInfo.new(@lccn)
      end

      # Load via LCPublicationInfo first; use the ChronAm fallback if that
      # implementation reports itself empty.
      def load
        primary = LCPublicationInfo.new(@lccn)
        @implementation = primary
        primary.load
        # Empty mods is equivalent to 404 for LCCN in LC Catalog:
        load_chronam_fallback if primary.empty?
      end

      # Advertise the methods of the underlying implementation as our own.
      def respond_to_missing?(symbol, include_priv = false)
        @implementation.respond_to?(symbol, include_priv)
      end

      # proxy call to underlying implementation, when it can respond:
      def method_missing(method, *args, &block)
        return super unless respond_to_missing?(method)
        @implementation.send(method, *args, &block)
      end
    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'faraday'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'uri'
|
4
|
+
require 'newspaper_works/ingest/from_command'
|
5
|
+
require 'newspaper_works/ingest/base_publication_info'
|
6
|
+
require 'newspaper_works/ingest/chronam_publication_info'
|
7
|
+
require 'newspaper_works/ingest/lc_publication_info'
|
8
|
+
require 'newspaper_works/ingest/publication_info'
|
9
|
+
require 'newspaper_works/ingest/pub_finder'
|
10
|
+
require 'newspaper_works/ingest/pdf_images'
|
11
|
+
require 'newspaper_works/ingest/named_issue_metadata'
|
12
|
+
require 'newspaper_works/ingest/path_enumeration'
|
13
|
+
require 'newspaper_works/ingest/pdf_issue'
|
14
|
+
require 'newspaper_works/ingest/pdf_issues'
|
15
|
+
require 'newspaper_works/ingest/batch_ingest_helper'
|
16
|
+
require 'newspaper_works/ingest/batch_issue_ingester'
|
17
|
+
require 'newspaper_works/ingest/pdf_pages'
|
18
|
+
require 'newspaper_works/ingest/issue_images'
|
19
|
+
require 'newspaper_works/ingest/page_image'
|
20
|
+
require 'newspaper_works/ingest/image_ingest_issues'
|
21
|
+
require 'newspaper_works/ingest/base_ingest'
|
22
|
+
require 'newspaper_works/ingest/ndnp'
|
23
|
+
require 'newspaper_works/ingest/newspaper_page_ingest'
|
24
|
+
require 'newspaper_works/ingest/newspaper_issue_ingest'
|
25
|
+
|
26
|
+
module NewspaperWorks
  # Module for Ingest adapters that import files into model objects
  module Ingest
    # Get Geonames URI for closest place match
    #   Requires Qa::Authorities::Geonames.username is set, likely via
    #   `Hyrax.config.geonames_username=` setter in
    #   config/initializers/hyrax.rb of consuming app.
    # @param place_name [String] Name of place as human-readable text
    # @return [String, NilClass] URI to Geonames RDF or nil
    def self.geonames_place_uri(place_name)
      username = Qa::Authorities::Geonames.username
      return if username.nil? || username.empty?
      # Drop periods, and keep only the text before any bracketed or
      # parenthesized qualifier; an empty name yields an empty string.
      place_name = place_name.to_s.delete('.').split(/[\[\(]/).first.to_s.strip
      return if place_name.empty?
      # URI.encode was removed in Ruby 3.0; encode_www_form escapes both
      # query parameters for use in a query string.
      geo_qs = URI.encode_www_form(q: place_name, username: username)
      url = "http://api.geonames.org/search?#{geo_qs}"
      resp = NewspaperWorks::ResourceFetcher.get url
      doc = Nokogiri.XML(resp['body'])
      geonames_id = doc.xpath('//geonames/geoname[1]/geonameId').first
      return if geonames_id.nil?
      "http://sws.geonames.org/#{geonames_id.text}/"
    end

    # Normalize publication title from catalog data
    #   Presently strips surrounding whitespace and trailing period(s)
    # @param title [String]
    # @return [String] normalized title
    def self.normalize_title(title)
      title.strip.sub(/[.]+$/, '')
    end

    # Get publication metadata from LC catalog MODS data, if available,
    #   and from ChronAm, as a fallback.
    # @param lccn [String] Library of Congress Control number for publication
    # @return [NewspaperWorks::Ingest::PublicationInfo] proxy to metadata
    #   source, an object for accessors for publication fields.
    def self.publication_metadata(lccn)
      PublicationInfo.new(lccn)
    end

    # @param admin_set [AdminSet, String, NilClass] admin set or its id;
    #   nil means the default admin set
    # @return [AdminSet] the given or looked-up admin set
    # @raise re-raises the lookup error for any non-default admin set id
    def self.find_admin_set(admin_set = nil)
      return admin_set if admin_set.class == AdminSet
      admin_set = AdminSet::DEFAULT_ID if admin_set.nil?
      begin
        AdminSet.find(admin_set)
      rescue StandardError
        # only create if default admin set
        raise unless admin_set == AdminSet::DEFAULT_ID
        AdminSet.find(AdminSet.find_or_create_default_admin_set_id)
      end
    end

    # Assign depositor, admin set, visibility, resource type, timestamps
    # and active state to a work.
    # @param work [Object] work model object to modify
    # @param opts [Hash] optional :email, :admin_set, :visibility overrides
    def self.assign_administrative_metadata(work, opts = {})
      work.depositor = opts.fetch(:email, User.batch_user.user_key)
      work.admin_set = find_admin_set(opts.fetch(:admin_set, nil))
      work.visibility = opts.fetch(:visibility, 'open')
      work.resource_type = ['Newspapers']
      work.date_modified ||= Hyrax::TimeService.time_in_utc
      work.date_uploaded ||= work.date_modified
      work.state = RDF::URI(
        'http://fedora.info/definitions/1/0/access/ObjState#active'
      )
    end
  end
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
module NewspaperWorks
  # Adapter class composes a PDF derivative for issue, if it requires one.
  class IssuePDFComposer
    attr_accessor :issue, :page_pdfs

    CMD_BASE = "gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite".freeze

    # @param issue [NewspaperIssue] adapts issue work object
    def initialize(issue)
      @issue = issue
      # paths to page PDFs
      @page_pdfs = []
    end

    # Compose and attach the issue PDF, unless one already exists.
    # @raise [NewspaperWorks::PagesNotReady] if any page PDF is not valid
    # @raise [NewspaperWorks::DataError] if Ghostscript fails or the
    #   generated PDF does not validate
    def compose
      # we will not step on any existing PDF
      return if issue_pdf_exists?
      # we can not compose a multi-page issue PDF if constituent page PDFs
      #   do not exist (yet == not ready, possibly waiting on an async job).
      @page_pdfs = validated_page_pdfs
      # Compose a Ghostscript command to merge all paths in @page_pdfs into
      #   a single output document, execute:
      compose_from_pages
    end

    # Merge @page_pdfs into one PDF in a new temporary directory, validate
    # it, and attach it to the issue.
    # @raise [NewspaperWorks::DataError] on Ghostscript failure or if the
    #   output does not validate
    def compose_from_pages
      outfile = File.join(Dir.mktmpdir, output_filename)
      # argv form: each path is one argument regardless of spaces or shell
      # metacharacters. capture3 drains stdout/stderr while the command
      # runs, so waiting on exit status cannot deadlock on a full pipe.
      _stdout, stderr, status = Open3.capture3(
        *CMD_BASE.split, "-sOutputFile=#{outfile}", *@page_pdfs.map(&:to_s)
      )
      unless status.success?
        e = "Ghostscript Error: \n#{stderr}"
        raise NewspaperWorks::DataError, e
      end
      # at this point, something should exist and validate at path `outfile`:
      raise NewspaperWorks::DataError, "Generated PDF invalid" unless validate_pdf(outfile)
      # Assign for attachment to issue, commit:
      attach_to_issue(outfile)
    end

    # @return [String] filename for the composed issue PDF
    def output_filename
      "#{@issue.id}_full-issue.pdf"
    end

    # Validate PDF with poppler `pdfinfo` command, which will detect
    #   error conditions in cases like truncated PDF, and only in those
    #   error conditions will write to stderr.
    # @param path [String] path to PDF file
    # @return [Boolean] true or false
    def validate_pdf(path)
      return false if path.nil? || !File.exist?(path)
      return false if File.size(path).zero?
      _stdout, stderr, _status = Open3.capture3('pdfinfo', path.to_s)
      # only zero bytes stderr output from `pdfinfo` considered valid PDF:
      stderr.empty?
    end

    private

      # @return [Array] list of paths to page PDFs, in page order
      # @raise [NewspaperWorks::PagesNotReady] if any page has invalid
      #   or non-ready PDF source.
      def validated_page_pdfs
        # if any page PDF invalid, raise; otherwise collect its path:
        issue.pages.to_a.each_with_index.map do |page, idx|
          e = "Page PDFs not ready for issue "\
            "(Issue id: #{issue.id}, Page index: #{idx})"
          path = derivatives_of(page).path('pdf')
          raise NewspaperWorks::PagesNotReady, e unless validate_pdf(path)
          path
        end
      end

      # @return whether the issue already has a 'pdf' derivative
      def issue_pdf_exists?
        derivatives_of(@issue).exist?('pdf')
      end

      # @return derivatives adapter for the given work
      def derivatives_of(work)
        NewspaperWorks::Data::WorkDerivatives.of(work)
      end

      # Make sure the system temp directory is among Hyrax's whitelisted
      # ingest directories, since the composed PDF is written beneath it.
      def ensure_whitelist
        whitelist = Hyrax.config.whitelisted_ingest_dirs
        whitelist.push(Dir.tmpdir) unless whitelist.include?(Dir.tmpdir)
      end

      # Attach the file at path to the issue and commit.
      def attach_to_issue(path)
        ensure_whitelist
        # We rely upon WorkFiles to create fileset, and by consequence of
        #   running primary file attachment through actor stack,
        #   visibility of the FileSet is copied from the work:
        attachment = NewspaperWorks::Data::WorkFiles.of(@issue)
        attachment.assign(path)
        attachment.commit!
      end
  end
end
|