newspaper_works 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.fcrepo_wrapper +4 -0
- data/.gitignore +43 -0
- data/.rubocop.yml +143 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +50 -0
- data/Gemfile +47 -0
- data/LICENSE +203 -0
- data/README.md +159 -0
- data/Rakefile +38 -0
- data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
- data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
- data/app/assets/config/newspaper_works_manifest.js +2 -0
- data/app/assets/images/newspaper_works/.keep +0 -0
- data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
- data/app/assets/javascripts/newspaper_works.js +4 -0
- data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
- data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
- data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
- data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
- data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
- data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
- data/app/forms/hyrax/newspaper_article_form.rb +11 -0
- data/app/forms/hyrax/newspaper_container_form.rb +11 -0
- data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
- data/app/forms/hyrax/newspaper_page_form.rb +15 -0
- data/app/forms/hyrax/newspaper_title_form.rb +12 -0
- data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
- data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
- data/app/helpers/newspaper_works/application_helper.rb +5 -0
- data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
- data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
- data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
- data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
- data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
- data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
- data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
- data/app/indexers/newspaper_article_indexer.rb +16 -0
- data/app/indexers/newspaper_container_indexer.rb +18 -0
- data/app/indexers/newspaper_issue_indexer.rb +26 -0
- data/app/indexers/newspaper_page_indexer.rb +9 -0
- data/app/indexers/newspaper_title_indexer.rb +19 -0
- data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
- data/app/jobs/newspaper_works/application_job.rb +4 -0
- data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
- data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
- data/app/mailers/newspaper_works/application_mailer.rb +8 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
- data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
- data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
- data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
- data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
- data/app/models/file_set.rb +10 -0
- data/app/models/newspaper_article.rb +158 -0
- data/app/models/newspaper_container.rb +86 -0
- data/app/models/newspaper_issue.rb +115 -0
- data/app/models/newspaper_page.rb +70 -0
- data/app/models/newspaper_title.rb +111 -0
- data/app/models/newspaper_works/application_record.rb +6 -0
- data/app/models/newspaper_works/derivative_attachment.rb +8 -0
- data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
- data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
- data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
- data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
- data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
- data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
- data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
- data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
- data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
- data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
- data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
- data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
- data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
- data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
- data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
- data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
- data/app/services/hyrax/article_genre_service.rb +9 -0
- data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
- data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
- data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
- data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
- data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
- data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
- data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
- data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
- data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
- data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
- data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
- data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
- data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
- data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
- data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
- data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
- data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
- data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
- data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
- data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
- data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
- data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
- data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
- data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
- data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
- data/app/views/newspaper_works/base/_show.html.erb +45 -0
- data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
- data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
- data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
- data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
- data/app/views/records/edit_fields/_genre.html.erb +4 -0
- data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
- data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/newspaper_article.de.yml +12 -0
- data/config/locales/newspaper_article.en.yml +12 -0
- data/config/locales/newspaper_article.es.yml +12 -0
- data/config/locales/newspaper_article.fr.yml +12 -0
- data/config/locales/newspaper_article.it.yml +12 -0
- data/config/locales/newspaper_article.pt-BR.yml +12 -0
- data/config/locales/newspaper_article.zh.yml +12 -0
- data/config/locales/newspaper_container.de.yml +8 -0
- data/config/locales/newspaper_container.en.yml +8 -0
- data/config/locales/newspaper_container.es.yml +8 -0
- data/config/locales/newspaper_container.fr.yml +8 -0
- data/config/locales/newspaper_container.it.yml +8 -0
- data/config/locales/newspaper_container.pt-BR.yml +8 -0
- data/config/locales/newspaper_container.zh.yml +8 -0
- data/config/locales/newspaper_issue.de.yml +8 -0
- data/config/locales/newspaper_issue.en.yml +8 -0
- data/config/locales/newspaper_issue.es.yml +8 -0
- data/config/locales/newspaper_issue.fr.yml +8 -0
- data/config/locales/newspaper_issue.it.yml +8 -0
- data/config/locales/newspaper_issue.pt-BR.yml +8 -0
- data/config/locales/newspaper_issue.zh.yml +8 -0
- data/config/locales/newspaper_page.de.yml +15 -0
- data/config/locales/newspaper_page.en.yml +15 -0
- data/config/locales/newspaper_page.es.yml +15 -0
- data/config/locales/newspaper_page.fr.yml +15 -0
- data/config/locales/newspaper_page.it.yml +15 -0
- data/config/locales/newspaper_page.pt-BR.yml +15 -0
- data/config/locales/newspaper_page.zh.yml +15 -0
- data/config/locales/newspaper_title.de.yml +8 -0
- data/config/locales/newspaper_title.en.yml +8 -0
- data/config/locales/newspaper_title.es.yml +8 -0
- data/config/locales/newspaper_title.fr.yml +8 -0
- data/config/locales/newspaper_title.it.yml +8 -0
- data/config/locales/newspaper_title.pt-BR.yml +8 -0
- data/config/locales/newspaper_title.zh.yml +8 -0
- data/config/locales/newspaper_works.de.yml +50 -0
- data/config/locales/newspaper_works.en.yml +52 -0
- data/config/locales/newspaper_works.es.yml +52 -0
- data/config/locales/newspaper_works.fr.yml +52 -0
- data/config/locales/newspaper_works.it.yml +52 -0
- data/config/locales/newspaper_works.pt-BR.yml +52 -0
- data/config/locales/newspaper_works.zh.yml +52 -0
- data/config/routes.rb +9 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
- data/lib/generators/newspaper_works/assets_generator.rb +29 -0
- data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
- data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
- data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
- data/lib/generators/newspaper_works/install_generator.rb +97 -0
- data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
- data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
- data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
- data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
- data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
- data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
- data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
- data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
- data/lib/newspaper_works/configuration.rb +14 -0
- data/lib/newspaper_works/data/fileset_helper.rb +25 -0
- data/lib/newspaper_works/data/path_helper.rb +40 -0
- data/lib/newspaper_works/data/work_derivatives.rb +314 -0
- data/lib/newspaper_works/data/work_file.rb +92 -0
- data/lib/newspaper_works/data/work_files.rb +181 -0
- data/lib/newspaper_works/data.rb +35 -0
- data/lib/newspaper_works/engine.rb +42 -0
- data/lib/newspaper_works/errors.rb +14 -0
- data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
- data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
- data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
- data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
- data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
- data/lib/newspaper_works/ingest/from_command.rb +52 -0
- data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
- data/lib/newspaper_works/ingest/issue_images.rb +51 -0
- data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
- data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
- data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
- data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
- data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
- data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
- data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
- data/lib/newspaper_works/ingest/ndnp.rb +21 -0
- data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
- data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
- data/lib/newspaper_works/ingest/page_image.rb +52 -0
- data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
- data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
- data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
- data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
- data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
- data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
- data/lib/newspaper_works/ingest/publication_info.rb +44 -0
- data/lib/newspaper_works/ingest.rb +90 -0
- data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
- data/lib/newspaper_works/logging.rb +54 -0
- data/lib/newspaper_works/page_finder.rb +62 -0
- data/lib/newspaper_works/resource_fetcher.rb +78 -0
- data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
- data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
- data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
- data/lib/newspaper_works/text_extraction.rb +10 -0
- data/lib/newspaper_works/version.rb +3 -0
- data/lib/newspaper_works.rb +19 -0
- data/lib/tasks/newspaper_works_tasks.rake +39 -0
- data/newspaper_works.gemspec +49 -0
- data/spec/.keep.txt +1 -0
- data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
- data/spec/controllers/catalog_controller_spec.rb +63 -0
- data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
- data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_issue_ingest.rb +6 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_ingest.rb +6 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/features/front_pages_for_title_spec.rb +19 -0
- data/spec/features/newspaper_title_search_spec.rb +30 -0
- data/spec/features/newspapers_search_spec.rb +49 -0
- data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
- data/spec/features_shared.rb +71 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
- data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
- data/spec/fixtures/files/resource_mocks/urls.json +82 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
- data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
- data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
- data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
- data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
- data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
- data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
- data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
- data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
- data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
- data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
- data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
- data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
- data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
- data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
- data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
- data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
- data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
- data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
- data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
- data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
- data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
- data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
- data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
- data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
- data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
- data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
- data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
- data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
- data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
- data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
- data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
- data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
- data/spec/lib/newspaper_works/logging_spec.rb +53 -0
- data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
- data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
- data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
- data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
- data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
- data/spec/misc_shared.rb +109 -0
- data/spec/model_shared.rb +134 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
- data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
- data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
- data/spec/models/newspaper_article_spec.rb +73 -0
- data/spec/models/newspaper_container_spec.rb +111 -0
- data/spec/models/newspaper_issue_spec.rb +91 -0
- data/spec/models/newspaper_page_spec.rb +44 -0
- data/spec/models/newspaper_title_spec.rb +116 -0
- data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
- data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/ndnp_shared.rb +48 -0
- data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
- data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
- data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
- data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
- data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
- data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
- data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
- data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
- data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
- data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
- data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
- data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
- data/spec/routing/route_spec.rb +52 -0
- data/spec/search_builders/custom_search_builder_spec.rb +34 -0
- data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
- data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
- data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
- data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
- data/spec/spec_helper.rb +261 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
- data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
- data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
- data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
- data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
- data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
- data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
- data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
- data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
- data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
- data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
- data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
- data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
- data/tasks/newspaperworks_dev.rake +26 -0
- data/test/integration/navigation_test.rb +7 -0
- data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
- data/test/newspaper_works_test.rb +7 -0
- data/test/test_helper.rb +17 -0
- data/tmp/.keep +0 -0
- metadata +1037 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module NewspaperWorks
  module Ingest
    # Represents a single TIFF/JP2 page image: access to the file path, the
    # issue it belongs to, and page-numbering metadata.
    class PageImage
      attr_accessor :path, :issue, :sequence

      # @param path [String] path to the page image; must be an existing
      #   regular file
      # @param issue [Object] owning issue (a
      #   NewspaperWorks::Ingest::IssueImages object); must respond to
      #   `title` and `lccn`
      # @param sequence [#to_i] page sequence number, stored as an Integer
      # @raise [ArgumentError] if path is missing or is not a regular file
      def initialize(path, issue, sequence)
        @path = path
        validate_path
        @issue = issue
        @sequence = sequence.to_i
      end

      # LCCN of the owning issue. Written as an explicit reader so that this
      # file does not rely on ActiveSupport's `delegate` being loaded first.
      # @return [Object] whatever `issue.lccn` returns
      def lccn
        issue.lccn
      end

      # Page number inferred from image filename, or nil, presuming that:
      #   - The page number follows the actual word "page" (case-insensitive)
      #     in filename, possibly separated by a dash or underscore.
      #   - The page number is terminated by the period-plus-file-extension.
      #   - Both of the above can be determined by regular expression match.
      #   - Extraneous leading information in filename (e.g. datestamp) will
      #     be ignored.
      #   - Examples:
      #     - 'Page1.tiff'
      #     - '2019091801-page_1.jp2'
      #     - 'page_C2.tiff'
      # Only the basename is examined, so a directory whose name happens to
      # contain "page" cannot be mistaken for a page number.
      # @return [String, NilClass] page number string, or nil if indecipherable
      def named_page_number
        pattern = /page[_-]?(?<number>[^.]+)[.]/i
        match = pattern.match(File.basename(path))
        match && match[:number]
      end

      # @return [String] page number taken from the filename when available,
      #   otherwise the sequence number as a string
      def page_number
        named_page_number || @sequence.to_s
      end

      # @return [Array<String>] single-element array combining the first
      #   title of the issue with this page's number
      def title
        ["#{@issue.title.first}: Page #{page_number}"]
      end

      # Expects path to be a regular file that exists. File.ftype does not
      # follow symbolic links, so a symlink is rejected as well.
      # @raise [ArgumentError] if either expectation is not met
      def validate_path
        raise ArgumentError, "No such file: #{path}" unless File.exist?(path)
        return if File.ftype(path) == 'file'
        raise ArgumentError, "Not a regular file: #{path}"
      end
    end
  end
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module NewspaperWorks
  module Ingest
    # Provides Hash-like enumeration of path keys to object values, where:
    #   - Consuming class:
    #     - Defines a `paths` method returning array of paths.
    #     - Defines an `info` method that returns an object for a path.
    #     - Also mixes in Enumerable
    module PathEnumeration
      # Indirection over the consumer's `paths`, so the aliases declared at
      # the bottom of this module have a method defined here to bind to.
      def _paths
        paths
      end

      # Indirection over the consumer's `info`, for the same reason.
      def _info(path)
        info(path)
      end

      # Forwarded to the paths collection. Written as explicit methods so
      # that this file does not rely on ActiveSupport's `delegate` being
      # loaded first.
      # @return [Integer] number of paths
      def size
        _paths.size
      end

      # @param path [Object] candidate key
      # @return [Boolean] whether path is one of the enumerated paths
      def include?(path)
        _paths.include?(path)
      end

      # Yields a [path, info] pair for each path.
      # @return [Enumerator] when called without a block
      def each
        return enum_for(:each) unless block_given?
        paths.each do |path|
          yield [path, info(path)]
        end
      end

      # Yields each path.
      # @return [Enumerator] when called without a block
      def each_key
        # The `return` is essential: without it the enumerator is discarded
        # and the yield below raises LocalJumpError when no block is given.
        return enum_for(:each_key) unless block_given?
        paths.each { |path| yield path }
      end

      # Yields the info object for each path.
      # @return [Enumerator] when called without a block
      def each_value
        return enum_for(:each_value) unless block_given?
        paths.each do |path|
          yield info(path)
        end
      end

      # @return [Array] info objects, in path order
      def values
        each_value.to_a
      end

      # @return [Array<Array>] [path, info] pairs, in path order
      def entries
        each.to_a
      end

      alias each_pair each
      alias keys _paths
      alias has_key? include?
      alias [] _info
    end
  end
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
require 'open3'
|
|
2
|
+
require 'mini_magick'
|
|
3
|
+
|
|
4
|
+
module NewspaperWorks
  module Ingest
    # PdfImages uses poppler 0.19+ pdfimages command to extract image
    #   listing metadata from PDF files.
    #   For dpi extraction, falls back to calculating using MiniMagick,
    #   if necessary.
    class PdfImages
      # class constant column numbers (zero-based index into a listing row)
      COL_WIDTH = 3
      COL_HEIGHT = 4
      COL_COLOR = 5
      COL_CHANNELS = 6
      COL_BITS = 7
      # only poppler 0.25+ has this column in output:
      COL_XPPI = 12

      # @param path [String] path to PDF file
      def initialize(path)
        @path = path
        # argument vector rather than a shell string, so that a path
        # containing spaces or shell metacharacters is passed verbatim:
        @cmd = ['pdfimages', '-list', path.to_s]
        @output = nil
        @entries = nil
      end

      # Runs the listing command at most once, caching its output lines.
      # @return [Array<String>] listing rows with the first two (header)
      #   lines removed; empty when the command printed fewer than 3 lines
      def process
        # call just once
        if @output.nil?
          # capture3 drains both stdout and stderr, so the child cannot
          # stall on an unread pipe:
          stdout, _stderr, _status = Open3.capture3(*@cmd)
          @output = stdout.split("\n")
        end
        @output.drop(2)
      end

      # @return [Array<Array<String>>] each listing row split on whitespace
      def entries
        @entries ||= process.map(&:split)
      end

      # Selects one column across all rows, optionally transforming values.
      # @param i [Integer] zero-based column index
      # @yield [String, NilClass] each column value, when a block is given
      # @return [Array] column values, mapped through the block if given
      def selectcolumn(i, &block)
        column = entries.map { |e| e[i] }
        block ? column.map(&block) : column
      end

      # @return [Integer, NilClass] largest image width listed, nil if none
      def width
        selectcolumn(COL_WIDTH, &:to_i).max
      end

      # @return [Integer, NilClass] largest image height listed, nil if none
      def height
        selectcolumn(COL_HEIGHT, &:to_i).max
      end

      # @return [Array] three items: color description, channels, bits
      def color
        # desc is either 'gray', 'cmyk', 'rgb', but 1-bit gray is black/white
        #   so caller may want all of this information, and in case of
        #   mixed color spaces across images, this returns maximum
        desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
        channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
        bits = entries.map { |e| e[COL_BITS].to_i }.max
        [desc, channels, bits]
      end

      # Resolution of listed images. Expects at least one listing row.
      # @return [Integer] pixels per inch
      def ppi
        if entries[0].size <= COL_XPPI
          # poppler < 0.25: no x-ppi column, so estimate from the width
          #   MiniMagick reports for the PDF (treated as points, 72/inch)
          pdf = MiniMagick::Image.open(@path)
          width_points = pdf.width
          width_px = width
          return (72 * width_px / width_points).to_i
        end
        # with poppler 0.25+, pdfimages just gives us this:
        selectcolumn(COL_XPPI, &:to_i).max
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require 'date'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
  module Ingest
    # A single issue PDF, identified by its path and paired with metadata
    #   for the publication it belongs to.
    class PDFIssue
      # most accessors for issue/edition metadata, publication metadata
      #   provided by including this mixin:
      include NewspaperWorks::Ingest::NamedIssueMetadata

      attr_accessor :path, :publication

      # @param path [String] path to issue PDF
      # @param publication [NewspaperWorks::Ingest::PublicationInfo]
      def initialize(path, publication)
        @path = path
        # validate_path is not defined in this class; presumably supplied
        #   by the included mixin -- it runs once @path has been set.
        validate_path
        @publication = publication
      end
    end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
require 'find'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
  module Ingest
    # Enumerates the issue PDFs found beneath a directory, keyed by path,
    #   with a PDFIssue object as the value for each path.
    class PDFIssues
      include Enumerable
      include NewspaperWorks::Ingest::PathEnumeration

      attr_accessor :path, :publication, :pdf_paths

      # PathEnumeration expects a `paths` method:
      alias paths pdf_paths

      # @param path [String] directory to search for PDF files
      # @param publication [NewspaperWorks::Ingest::PublicationInfo]
      def initialize(path, publication)
        @path = path
        @publication = publication
        @pdf_paths = valid_pdfs(path)
      end

      # @param path [String] directory to walk
      # @return [Array<String>] paths of non-directory entries beneath
      #   path whose name ends with '.pdf'
      def valid_pdfs(path)
        Find.find(path).select do |candidate|
          !File.directory?(candidate) && candidate.end_with?('.pdf')
        end
      end

      # @return LCCN as reported by the publication object
      def lccn
        @publication.lccn
      end

      # @param path [String] path to an issue PDF
      # @return [NewspaperWorks::Ingest::PDFIssue]
      def info(path)
        NewspaperWorks::Ingest::PDFIssue.new(path, @publication)
      end
    end
  end
end
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
require 'open3'
|
|
2
|
+
require 'securerandom'
|
|
3
|
+
require 'tmpdir'
|
|
4
|
+
|
|
5
|
+
module NewspaperWorks
  module Ingest
    # Enumerates per-page TIFF files produced from a PDF by Ghostscript.
    #   The conversion runs lazily, the first time entries are requested.
    class PdfPages
      include Enumerable

      # @param path [String] path to source PDF
      def initialize(path)
        @baseid = SecureRandom.uuid
        @pdfpath = path
        @info = nil
        @entries = nil
        @tmpdir = nil
        @size = nil
        @pagecount = nil
        @pdftext = nil
      end

      # @return [PdfImages] image listing metadata for the PDF (memoized)
      def pdfinfo
        @info ||= PdfImages.new(@pdfpath)
      end

      # @return [String] temporary output directory, created on first use.
      #   Nothing in this class removes it.
      def tmpdir
        @tmpdir ||= Dir.mktmpdir
      end

      # @param channels [Integer] color channels
      # @param bpc [Integer] bits per channel
      # @return [String] Ghostscript color TIFF device name
      def colordevice(channels, bpc)
        bits = bpc * channels
        # will be either 8bpc/16bpc color TIFF,
        #   with any CMYK source transformed to 8bpc RGB
        bits = 24 unless [24, 48].include?(bits)
        "tiff#{bits}nc"
      end

      # @return [String] Ghostscript output device suited to source images
      def gsdevice
        color, channels, bpc = pdfinfo.color
        if color == 'gray'
          # CCITT Group 4 Black and White, if applicable:
          return 'tiffg4' if bpc == 1
          # 8 Bit Grayscale, if applicable:
          return 'tiffgray' if bpc > 1
        end
        # otherwise color:
        colordevice(channels, bpc)
      end

      # @return [String] output of Ghostscript's txtwrite device for the
      #   PDF (memoized)
      def gstext
        @pdftext ||= run_command(
          'gs', '-q', '-dNOPAUSE', '-dBATCH', '-sDEVICE=txtwrite',
          '-sOutputFile=-', '-f', @pdfpath.to_s
        )
      end

      # @return [Integer] page count as reported by `pdfinfo` (memoized)
      # @raise [RuntimeError] if pdfinfo output has no "Pages:" line
      def pagecount
        @pagecount ||= begin
          listing = run_command('pdfinfo', @pdfpath.to_s)
          pages_e = listing.each_line.find { |e| e.start_with?('Pages:') }
          raise "Unable to determine page count for #{@pdfpath}" if pages_e.nil?
          pages_e.split.last.to_i
        end
      end

      # @return [Boolean] true when the PDF has exactly one image per page
      #   and the largest image dimensions multiply to over 10 * 1024 * 1024
      def looks_scanned
        max_image_px = pdfinfo.width * pdfinfo.height
        single_image_per_page = pdfinfo.entries.length == pagecount
        # single 10mp+ image per page?
        single_image_per_page && max_image_px > 1024 * 1024 * 10
      end

      # @return [Integer] resolution to render pages at
      def ppi
        # 400 dpi for something that does not look like scanned media:
        return 400 unless looks_scanned
        # For scanned media, defer to detected image PPI:
        pdfinfo.ppi
      end

      # ghostscript convert all pages to TIFF
      # @return [Array<String>] expected paths of the generated page TIFFs
      def gsconvert
        output_base = File.join(tmpdir, "#{@baseid}-page%d.tiff")
        stdout = run_command(
          'gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{gsdevice}",
          '-dTextAlphaBits=4', "-sOutputFile=#{output_base}",
          "-r#{ppi}", '-f', @pdfpath.to_s
        )
        # one output line starting with "Page " is counted per page:
        @size = stdout.each_line.count { |e| e.start_with?('Page ') }
        # Return an array of expected filenames
        (1..@size).map { |n| File.join(tmpdir, "#{@baseid}-page#{n}.tiff") }
      end

      # entries for each page
      # @return [Array<String>] page TIFF paths (conversion runs once)
      def entries
        @entries = gsconvert if @entries.nil?
        @entries
      end

      # Yields each page TIFF path.
      # @return [Enumerator] when called without a block
      def each(&block)
        return enum_for(:each) unless block_given?
        entries.each(&block)
      end

      private

      # Runs an external command from an argument vector (no shell is
      #   involved, so paths need no quoting) and drains both output pipes.
      # @return [String] standard output of the command
      def run_command(*argv)
        stdout, _stderr, _status = Open3.capture3(*argv)
        stdout
      end
    end
  end
end
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
require 'newspaper_works/logging'
|
|
2
|
+
require 'newspaper_works/ingest'
|
|
3
|
+
|
|
4
|
+
module NewspaperWorks
  module Ingest
    # mixin for find-or-create of publication, for use by various ingests
    module PubFinder
      include NewspaperWorks::Logging

      # fields copied from looked-up metadata onto the publication:
      COPY_FIELDS = %i[
        title
        lccn
        oclcnum
        issn
        place_of_publication
        language
        preceded_by
        succeeded_by
      ].freeze

      # fields whose values are assigned wrapped in an array:
      MULTI_VALUED = %i[
        title
        language
        preceded_by
        succeeded_by
        place_of_publication
      ].freeze

      # fields whose values are wrapped in an object before assignment:
      WRAPPERS = {
        place_of_publication: Hyrax::ControlledVocabularies::Location
      }.freeze

      # @param lccn [String] Library of Congress Control Number
      #   of Publication
      # @return [NewspaperTitle, NilClass] publication or nil if not found
      def find_publication(lccn)
        NewspaperTitle.where(lccn: lccn).first
      end

      # Copy publication metadata from authority lookup for LCCN
      # @param publication [NewspaperTitle]
      # @param metadata [NewspaperWorks::Ingest::PublicationInfo]
      # @param lccn [String] used as title when no other title is available
      # @param title [String, NilClass] locally specified title, if any
      def copy_publication_metadata(publication, metadata, lccn, title = nil)
        COPY_FIELDS.each do |field|
          found = metadata.send(field)
          next if found.nil?
          # wrapped value, if applicable:
          found = WRAPPERS[field].new(found) if WRAPPERS.key?(field)
          # value in array, if applicable:
          found = [found] if MULTI_VALUED.include?(field)
          publication.send("#{field}=", found)
        end
        # prefer locally-specified title to looked-up title:
        publication.title = [title] unless title.nil?
        # final fallback, nothing specified, title mandatory: use LCCN
        publication.title = [lccn] if publication.title.empty?
      end

      # Creates, populates, saves and logs a new NewspaperTitle for an LCCN.
      # @param lccn [String]
      # @param title [String, NilClass] locally specified title, if any
      # @param opts [Hash] passed to assign_administrative_metadata
      # @return [NewspaperTitle] the saved publication
      def create_publication(lccn, title = nil, opts = {})
        created = NewspaperTitle.create
        looked_up = NewspaperWorks::Ingest::PublicationInfo.new(lccn)
        copy_publication_metadata(created, looked_up, lccn, title)
        created.lccn ||= lccn
        NewspaperWorks::Ingest.assign_administrative_metadata(created, opts)
        created.save!
        write_log(
          "Created NewspaperTitle work #{created.id} for LCCN #{lccn}"
        )
        created
      end

      # Links an issue to the publication for an LCCN, creating the
      #   publication first when none is found.
      # @param issue [NewspaperIssue] issue to add as publication member
      # @param lccn [String]
      # @param title [String, NilClass]
      # @param opts [Hash] passed on to create_publication
      def find_or_create_publication_for_issue(issue, lccn, title, opts)
        publication = find_publication(lccn)
        if publication.nil?
          publication = create_publication(lccn, title, opts)
        else
          write_log(
            "Found existing NewspaperTitle #{publication.id}, LCCN #{lccn}"
          )
        end
        publication.members << issue
        publication.save!
        write_log(
          "Linked NewspaperIssue #{issue.id} to NewspaperTitle work #{publication.id}"
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
require 'faraday'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
require 'uri'
|
|
4
|
+
|
|
5
|
+
module NewspaperWorks
  module Ingest
    # Proxy to a publication metadata source for an LCCN: uses
    #   LCPublicationInfo, switching to ChronAmPublicationInfo when the
    #   former reports itself empty. Unknown methods are forwarded to
    #   whichever implementation is in use.
    class PublicationInfo
      attr_accessor :implementation, :lccn

      # @param lccn [String] Library of Congress Control Number
      def initialize(lccn)
        @lccn = lccn
        @implementation = nil
        load
      end

      # Replaces the implementation with a ChronAmPublicationInfo.
      def load_chronam_fallback
        @implementation = ChronAmPublicationInfo.new(@lccn)
      end

      # Loads the LC implementation, then falls back if it is empty.
      def load
        @implementation = LCPublicationInfo.new(@lccn)
        @implementation.load
        # Empty mods is equivalent to 404 for LCCN in LC Catalog:
        return unless @implementation.empty?
        load_chronam_fallback
      end

      # Answers respond_to? for methods the implementation provides.
      def respond_to_missing?(symbol, include_priv = false)
        @implementation.respond_to?(symbol, include_priv)
      end

      # proxy call to underlying implementation, when it responds to the
      #   method; otherwise defer to default handling.
      def method_missing(method, *args, &block)
        return super unless respond_to_missing?(method)
        @implementation.send(method, *args, &block)
      end
    end
  end
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
require 'faraday'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
require 'uri'
|
|
4
|
+
require 'newspaper_works/ingest/from_command'
|
|
5
|
+
require 'newspaper_works/ingest/base_publication_info'
|
|
6
|
+
require 'newspaper_works/ingest/chronam_publication_info'
|
|
7
|
+
require 'newspaper_works/ingest/lc_publication_info'
|
|
8
|
+
require 'newspaper_works/ingest/publication_info'
|
|
9
|
+
require 'newspaper_works/ingest/pub_finder'
|
|
10
|
+
require 'newspaper_works/ingest/pdf_images'
|
|
11
|
+
require 'newspaper_works/ingest/named_issue_metadata'
|
|
12
|
+
require 'newspaper_works/ingest/path_enumeration'
|
|
13
|
+
require 'newspaper_works/ingest/pdf_issue'
|
|
14
|
+
require 'newspaper_works/ingest/pdf_issues'
|
|
15
|
+
require 'newspaper_works/ingest/batch_ingest_helper'
|
|
16
|
+
require 'newspaper_works/ingest/batch_issue_ingester'
|
|
17
|
+
require 'newspaper_works/ingest/pdf_pages'
|
|
18
|
+
require 'newspaper_works/ingest/issue_images'
|
|
19
|
+
require 'newspaper_works/ingest/page_image'
|
|
20
|
+
require 'newspaper_works/ingest/image_ingest_issues'
|
|
21
|
+
require 'newspaper_works/ingest/base_ingest'
|
|
22
|
+
require 'newspaper_works/ingest/ndnp'
|
|
23
|
+
require 'newspaper_works/ingest/newspaper_page_ingest'
|
|
24
|
+
require 'newspaper_works/ingest/newspaper_issue_ingest'
|
|
25
|
+
|
|
26
|
+
module NewspaperWorks
  # Module for Ingest adapters that import files into model objects
  module Ingest
    # Get Geonames URI for closest place match
    #   Requires Qa::Authorities::Geonames.username is set, likely via
    #   `Hyrax.config.geonames_username=` setter in
    #   config/initializers/hyrax.rb of consuming app.
    # @param place_name [String] Name of place as human-readable text
    # @return [String, NilClass] URI to Geonames RDF or nil
    def self.geonames_place_uri(place_name)
      username = Qa::Authorities::Geonames.username
      return if username.nil? || username.empty?
      # drop periods, and anything from the first bracket/parenthesis on:
      place_name = place_name.delete('.').split(/[\[\(]/).first.to_s.strip
      # nothing left to search for:
      return if place_name.empty?
      # form-encode both parameters (URI.encode is gone from current Ruby):
      geo_qs = URI.encode_www_form(q: place_name, username: username)
      url = "http://api.geonames.org/search?#{geo_qs}"
      resp = NewspaperWorks::ResourceFetcher.get url
      doc = Nokogiri.XML(resp['body'])
      geonames_id = doc.xpath('//geonames/geoname[1]/geonameId').first
      return if geonames_id.nil?
      "http://sws.geonames.org/#{geonames_id.text}/"
    end

    # Normalize publication title from catalog data
    #   Presently strips surrounding whitespace and trailing period(s)
    # @param title [String]
    # @return [String] normalized title
    def self.normalize_title(title)
      # \z anchors at the end of the string, not the end of a line:
      title.strip.sub(/[.]+\z/, '')
    end

    # Get publication metadata from LC catalog MODS data, if available,
    #   and from ChronAm, as a fallback.
    # @param lccn [String] Library of Congress Control number for publication
    # @return [NewspaperWorks::Ingest::PublicationInfo] proxy to metadata
    #   source, an object for accessors for publication fields.
    def self.publication_metadata(lccn)
      PublicationInfo.new(lccn)
    end

    # @param admin_set [AdminSet, String, NilClass] admin set, or id of
    #   one; nil means the default admin set
    # @return [AdminSet]
    # @raise re-raises the lookup error for any id other than the default
    def self.find_admin_set(admin_set = nil)
      return admin_set if admin_set.is_a?(AdminSet)
      admin_set = AdminSet::DEFAULT_ID if admin_set.nil?
      begin
        AdminSet.find(admin_set)
      rescue StandardError
        # only create if default admin set
        raise unless admin_set == AdminSet::DEFAULT_ID
        AdminSet.find(AdminSet.find_or_create_default_admin_set_id)
      end
    end

    # Assigns depositor, admin set, visibility, resource type, dates and
    #   active state to a work.
    # @param work [Object] work responding to the setters used below
    # @param opts [Hash] optional :email, :admin_set, :visibility
    def self.assign_administrative_metadata(work, opts = {})
      # block form: the batch user is only looked up when no email given
      work.depositor = opts.fetch(:email) { User.batch_user.user_key }
      work.admin_set = find_admin_set(opts.fetch(:admin_set, nil))
      work.visibility = opts.fetch(:visibility, 'open')
      work.resource_type = ['Newspapers']
      work.date_modified ||= Hyrax::TimeService.time_in_utc
      work.date_uploaded ||= work.date_modified
      work.state = RDF::URI(
        'http://fedora.info/definitions/1/0/access/ObjState#active'
      )
    end
  end
end
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
require 'open3'
|
|
2
|
+
require 'tmpdir'
|
|
3
|
+
|
|
4
|
+
module NewspaperWorks
  # Adapter class composes a PDF derivative for issue, if it requires one.
  class IssuePDFComposer
    attr_accessor :issue, :page_pdfs

    CMD_BASE = "gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite".freeze

    # @param issue [NewspaperIssue] adapts issue work object
    def initialize(issue)
      @issue = issue
      # paths to page PDFs
      @page_pdfs = []
    end

    # Composes and attaches an issue PDF, unless the issue already has one.
    # @raise [NewspaperWorks::PagesNotReady] if any page PDF is not valid
    # @raise [NewspaperWorks::DataError] if Ghostscript fails, or if its
    #   output does not validate
    def compose
      # we will not step on any existing PDF
      return if issue_pdf_exists?
      # we can not compose a multi-page issue PDF if constituent page PDFs
      #   do not exist (yet == not ready, possibly waiting on an async job).
      @page_pdfs = validated_page_pdfs
      # Compose a Ghostscript command to merge all paths in @page_pdfs into
      #   a single output document, execute:
      compose_from_pages
    end

    # Merges the PDFs listed in @page_pdfs into one file in a new temporary
    #   directory, validates the result, and attaches it to the issue.
    # @raise [NewspaperWorks::DataError] on Ghostscript failure or when
    #   the generated file does not validate
    def compose_from_pages
      outfile = File.join(Dir.mktmpdir, output_filename)
      # argument vector, not a shell string, so paths are passed verbatim:
      argv = CMD_BASE.split + ["-sOutputFile=#{outfile}"] + @page_pdfs.map(&:to_s)
      # capture3 drains stdout and stderr while the process runs; waiting
      #   for exit before reading could stall on a full pipe.
      _stdout, stderr, status = Open3.capture3(*argv)
      unless status.success?
        raise NewspaperWorks::DataError, "Ghostscript Error: \n#{stderr}"
      end
      # at this point, something should exist and validate at path `outfile`:
      raise NewspaperWorks::DataError, "Generated PDF invalid" unless validate_pdf(outfile)
      # Assign for attachment to issue, commit:
      attach_to_issue(outfile)
    end

    # @return [String] file name used for the composed issue PDF
    def output_filename
      "#{@issue.id}_full-issue.pdf"
    end

    # Validate PDF with poppler `pdfinfo` command, which will detect
    #   error conditions in cases like truncated PDF, and only in those
    #   error conditions will write to stderr.
    # @param path [String] path to PDF file
    # @return [Boolean] true or false
    def validate_pdf(path)
      return false if path.nil? || !File.exist?(path)
      return false if File.size(path).zero?
      _stdout, stderr, _status = Open3.capture3('pdfinfo', path.to_s)
      # only zero bytes stderr output from `pdfinfo` considered valid PDF:
      stderr.empty?
    end

    private

    # @return [Array] list of paths to page PDFs, in page order
    # @raise [NewspaperWorks::PagesNotReady] if any page has invalid
    #   or non-ready PDF source.
    def validated_page_pdfs
      # if any page PDF invalid, raise; otherwise collect its path:
      issue.pages.to_a.each_with_index.map do |page, idx|
        path = derivatives_of(page).path('pdf')
        unless validate_pdf(path)
          e = "Page PDFs not ready for issue "\
            "(Issue id: #{issue.id}, Page index: #{idx})"
          raise NewspaperWorks::PagesNotReady, e
        end
        path
      end
    end

    # @return whether the issue already has a 'pdf' derivative
    def issue_pdf_exists?
      derivatives_of(@issue).exist?('pdf')
    end

    # @param work [Object] issue or page work
    # @return derivatives adapter for the work
    def derivatives_of(work)
      NewspaperWorks::Data::WorkDerivatives.of(work)
    end

    # Adds the system temporary directory to Hyrax's whitelisted ingest
    #   directories, if it is not already present.
    def ensure_whitelist
      whitelist = Hyrax.config.whitelisted_ingest_dirs
      whitelist.push(Dir.tmpdir) unless whitelist.include?(Dir.tmpdir)
    end

    # @param path [String] path of the composed PDF to attach
    def attach_to_issue(path)
      ensure_whitelist
      # We rely upon WorkFiles to create fileset, and by consequence of
      #   running primary file attachment through actor stack,
      #   visibility of the FileSet is copied from the work:
      attachment = NewspaperWorks::Data::WorkFiles.of(@issue)
      attachment.assign(path)
      attachment.commit!
    end
  end
end
|