newspaper_works 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.fcrepo_wrapper +4 -0
- data/.gitignore +43 -0
- data/.rubocop.yml +143 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +50 -0
- data/Gemfile +47 -0
- data/LICENSE +203 -0
- data/README.md +159 -0
- data/Rakefile +38 -0
- data/app/actors/hyrax/actors/newspaper_article_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_container_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_issue_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_page_actor.rb +8 -0
- data/app/actors/hyrax/actors/newspaper_title_actor.rb +8 -0
- data/app/actors/newspaper_works/actors/newspaper_works_upload_actor.rb +88 -0
- data/app/assets/config/newspaper_works_manifest.js +2 -0
- data/app/assets/images/newspaper_works/.keep +0 -0
- data/app/assets/javascripts/newspaper_works/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/newspaper_works/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/newspaper_works/thumbnail_highlights.js.erb +102 -0
- data/app/assets/javascripts/newspaper_works.js +4 -0
- data/app/assets/stylesheets/newspaper_works/_issue_search.scss +13 -0
- data/app/assets/stylesheets/newspaper_works/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/newspaper_works/_newspaper_works.scss +4 -0
- data/app/assets/stylesheets/newspaper_works/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/newspaper_works/_search_results.scss +12 -0
- data/app/controllers/hyrax/newspaper_articles_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_containers_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_issues_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_pages_controller.rb +14 -0
- data/app/controllers/hyrax/newspaper_titles_controller.rb +13 -0
- data/app/controllers/newspaper_works/newspapers_controller.rb +117 -0
- data/app/controllers/newspaper_works/newspapers_search_controller.rb +26 -0
- data/app/forms/hyrax/newspaper_article_form.rb +11 -0
- data/app/forms/hyrax/newspaper_container_form.rb +11 -0
- data/app/forms/hyrax/newspaper_issue_form.rb +11 -0
- data/app/forms/hyrax/newspaper_page_form.rb +15 -0
- data/app/forms/hyrax/newspaper_title_form.rb +12 -0
- data/app/forms/newspaper_works/newspaper_core_form_data.rb +17 -0
- data/app/helpers/hyrax/newspaper_articles_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_containers_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_issues_helper.rb +5 -0
- data/app/helpers/hyrax/newspaper_pages_helper.rb +5 -0
- data/app/helpers/newspaper_works/application_helper.rb +5 -0
- data/app/helpers/newspaper_works/breadcrumb_helper.rb +92 -0
- data/app/helpers/newspaper_works/newspaper_works_helper_behavior.rb +103 -0
- data/app/helpers/newspaper_works/newspapers_helper.rb +5 -0
- data/app/indexers/concerns/newspaper_works/indexes_full_text.rb +17 -0
- data/app/indexers/concerns/newspaper_works/indexes_place_of_publication.rb +67 -0
- data/app/indexers/concerns/newspaper_works/indexes_publication_date_range.rb +35 -0
- data/app/indexers/concerns/newspaper_works/indexes_relationships.rb +125 -0
- data/app/indexers/newspaper_article_indexer.rb +16 -0
- data/app/indexers/newspaper_container_indexer.rb +18 -0
- data/app/indexers/newspaper_issue_indexer.rb +26 -0
- data/app/indexers/newspaper_page_indexer.rb +9 -0
- data/app/indexers/newspaper_title_indexer.rb +19 -0
- data/app/indexers/newspaper_works/newspaper_core_indexer.rb +21 -0
- data/app/jobs/newspaper_works/application_job.rb +4 -0
- data/app/jobs/newspaper_works/compose_issue_pdf_job.rb +13 -0
- data/app/jobs/newspaper_works/create_issue_pages_job.rb +19 -0
- data/app/mailers/newspaper_works/application_mailer.rb +8 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior.rb +82 -0
- data/app/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior.rb +27 -0
- data/app/models/concerns/newspaper_works/newspaper_core_metadata.rb +67 -0
- data/app/models/concerns/newspaper_works/place_of_publication_behavior.rb +15 -0
- data/app/models/concerns/newspaper_works/scanned_media_metadata.rb +43 -0
- data/app/models/concerns/newspaper_works/solr/document.rb +25 -0
- data/app/models/file_set.rb +10 -0
- data/app/models/newspaper_article.rb +158 -0
- data/app/models/newspaper_container.rb +86 -0
- data/app/models/newspaper_issue.rb +115 -0
- data/app/models/newspaper_page.rb +70 -0
- data/app/models/newspaper_title.rb +111 -0
- data/app/models/newspaper_works/application_record.rb +6 -0
- data/app/models/newspaper_works/derivative_attachment.rb +8 -0
- data/app/models/newspaper_works/ingest_file_relation.rb +14 -0
- data/app/presenters/hyrax/newspaper_article_presenter.rb +38 -0
- data/app/presenters/hyrax/newspaper_container_presenter.rb +11 -0
- data/app/presenters/hyrax/newspaper_issue_presenter.rb +62 -0
- data/app/presenters/hyrax/newspaper_page_presenter.rb +72 -0
- data/app/presenters/hyrax/newspaper_title_presenter.rb +86 -0
- data/app/presenters/newspaper_works/iiif_manifest_presenter_behavior.rb +29 -0
- data/app/presenters/newspaper_works/issue_info_presenter.rb +29 -0
- data/app/presenters/newspaper_works/newspaper_core_presenter.rb +9 -0
- data/app/presenters/newspaper_works/persistent_url_presenter_behavior.rb +16 -0
- data/app/presenters/newspaper_works/place_of_publication_presenter_behavior.rb +8 -0
- data/app/presenters/newspaper_works/scanned_media_presenter.rb +7 -0
- data/app/presenters/newspaper_works/title_info_presenter.rb +13 -0
- data/app/search_builders/concerns/newspaper_works/exclude_models.rb +16 -0
- data/app/search_builders/concerns/newspaper_works/highlight_search_params.rb +14 -0
- data/app/search_builders/newspaper_works/newspapers_search_builder.rb +26 -0
- data/app/services/hyrax/article_genre_service.rb +9 -0
- data/app/services/newspaper_works/jp2_derivative_service.rb +120 -0
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +91 -0
- data/app/services/newspaper_works/pdf_derivative_service.rb +45 -0
- data/app/services/newspaper_works/pluggable_derivative_service.rb +114 -0
- data/app/services/newspaper_works/text_extraction_derivative_service.rb +56 -0
- data/app/services/newspaper_works/text_formats_from_alto_service.rb +77 -0
- data/app/services/newspaper_works/tiff_derivative_service.rb +54 -0
- data/app/validators/newspaper_works/publication_date_start_end_validator.rb +48 -0
- data/app/validators/newspaper_works/publication_date_validator.rb +16 -0
- data/app/views/catalog/_index_gallery_newspaper_article_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb +9 -0
- data/app/views/catalog/_index_header_gallery_newspaper_article.html.erb +23 -0
- data/app/views/catalog/_index_header_gallery_newspaper_page.html.erb +23 -0
- data/app/views/catalog/_index_header_list_newspaper_article.html.erb +7 -0
- data/app/views/catalog/_index_header_list_newspaper_page.html.erb +7 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/catalog/_thumbnail_list_newspaper_article.html.erb +6 -0
- data/app/views/catalog/_thumbnail_list_newspaper_page.html.erb +6 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/app/views/hyrax/newspaper_articles/_newspaper_article.html.erb +2 -0
- data/app/views/hyrax/newspaper_articles/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_containers/_newspaper_container.html.erb +2 -0
- data/app/views/hyrax/newspaper_containers/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_issues/_newspaper_issue.html.erb +2 -0
- data/app/views/hyrax/newspaper_issues/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_pages/_newspaper_page.html.erb +2 -0
- data/app/views/hyrax/newspaper_pages/show.html.erb +1 -0
- data/app/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb +5 -0
- data/app/views/hyrax/newspaper_titles/_issue_search_form.html.erb +33 -0
- data/app/views/hyrax/newspaper_titles/_issues_calendar.html.erb +63 -0
- data/app/views/hyrax/newspaper_titles/_newspaper_title.html.erb +2 -0
- data/app/views/hyrax/newspaper_titles/show.html.erb +54 -0
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +42 -0
- data/app/views/newspaper_works/base/_attributes.html.erb +16 -0
- data/app/views/newspaper_works/base/_metadata.html.erb +6 -0
- data/app/views/newspaper_works/base/_newspaper_hierarchy.html.erb +14 -0
- data/app/views/newspaper_works/base/_persistent_url.html.erb +1 -0
- data/app/views/newspaper_works/base/_show.html.erb +45 -0
- data/app/views/newspaper_works/newspapers_search/_date_fields.html.erb +29 -0
- data/app/views/newspaper_works/newspapers_search/_facet_layout.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/_facet_limit.html.erb +17 -0
- data/app/views/newspaper_works/newspapers_search/_front_pages_input.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_keyword_input.html.erb +18 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_facets.html.erb +5 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_form.html.erb +13 -0
- data/app/views/newspaper_works/newspapers_search/_newspapers_search_help.html.erb +8 -0
- data/app/views/newspaper_works/newspapers_search/search.html.erb +13 -0
- data/app/views/records/edit_fields/_alternate_title.html.erb +4 -0
- data/app/views/records/edit_fields/_genre.html.erb +4 -0
- data/app/views/records/edit_fields/_place_of_publication.html.erb +14 -0
- data/app/views/records/edit_fields/_subtitle.html.erb +4 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/newspaper_article.de.yml +12 -0
- data/config/locales/newspaper_article.en.yml +12 -0
- data/config/locales/newspaper_article.es.yml +12 -0
- data/config/locales/newspaper_article.fr.yml +12 -0
- data/config/locales/newspaper_article.it.yml +12 -0
- data/config/locales/newspaper_article.pt-BR.yml +12 -0
- data/config/locales/newspaper_article.zh.yml +12 -0
- data/config/locales/newspaper_container.de.yml +8 -0
- data/config/locales/newspaper_container.en.yml +8 -0
- data/config/locales/newspaper_container.es.yml +8 -0
- data/config/locales/newspaper_container.fr.yml +8 -0
- data/config/locales/newspaper_container.it.yml +8 -0
- data/config/locales/newspaper_container.pt-BR.yml +8 -0
- data/config/locales/newspaper_container.zh.yml +8 -0
- data/config/locales/newspaper_issue.de.yml +8 -0
- data/config/locales/newspaper_issue.en.yml +8 -0
- data/config/locales/newspaper_issue.es.yml +8 -0
- data/config/locales/newspaper_issue.fr.yml +8 -0
- data/config/locales/newspaper_issue.it.yml +8 -0
- data/config/locales/newspaper_issue.pt-BR.yml +8 -0
- data/config/locales/newspaper_issue.zh.yml +8 -0
- data/config/locales/newspaper_page.de.yml +15 -0
- data/config/locales/newspaper_page.en.yml +15 -0
- data/config/locales/newspaper_page.es.yml +15 -0
- data/config/locales/newspaper_page.fr.yml +15 -0
- data/config/locales/newspaper_page.it.yml +15 -0
- data/config/locales/newspaper_page.pt-BR.yml +15 -0
- data/config/locales/newspaper_page.zh.yml +15 -0
- data/config/locales/newspaper_title.de.yml +8 -0
- data/config/locales/newspaper_title.en.yml +8 -0
- data/config/locales/newspaper_title.es.yml +8 -0
- data/config/locales/newspaper_title.fr.yml +8 -0
- data/config/locales/newspaper_title.it.yml +8 -0
- data/config/locales/newspaper_title.pt-BR.yml +8 -0
- data/config/locales/newspaper_title.zh.yml +8 -0
- data/config/locales/newspaper_works.de.yml +50 -0
- data/config/locales/newspaper_works.en.yml +52 -0
- data/config/locales/newspaper_works.es.yml +52 -0
- data/config/locales/newspaper_works.fr.yml +52 -0
- data/config/locales/newspaper_works.it.yml +52 -0
- data/config/locales/newspaper_works.pt-BR.yml +52 -0
- data/config/locales/newspaper_works.zh.yml +52 -0
- data/config/routes.rb +9 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb +11 -0
- data/lib/generators/newspaper_works/assets_generator.rb +29 -0
- data/lib/generators/newspaper_works/blacklight_advanced_search_generator.rb +44 -0
- data/lib/generators/newspaper_works/blacklight_iiif_search_generator.rb +41 -0
- data/lib/generators/newspaper_works/catalog_controller_generator.rb +60 -0
- data/lib/generators/newspaper_works/install_generator.rb +97 -0
- data/lib/generators/newspaper_works/templates/annotation_behavior.rb +6 -0
- data/lib/generators/newspaper_works/templates/config/authorities/newspaper_article_genres.yml +86 -0
- data/lib/generators/newspaper_works/templates/config/initializers/newspaper_works.rb +12 -0
- data/lib/generators/newspaper_works/templates/config/initializers/patch_blacklight_advanced_search.rb +74 -0
- data/lib/generators/newspaper_works/templates/custom_search_builder.rb +23 -0
- data/lib/generators/newspaper_works/templates/newspaper_works.scss +1 -0
- data/lib/generators/newspaper_works/templates/newspaper_works_helper.rb +3 -0
- data/lib/generators/newspaper_works/templates/search_behavior.rb +6 -0
- data/lib/newspaper_works/configuration.rb +14 -0
- data/lib/newspaper_works/data/fileset_helper.rb +25 -0
- data/lib/newspaper_works/data/path_helper.rb +40 -0
- data/lib/newspaper_works/data/work_derivatives.rb +314 -0
- data/lib/newspaper_works/data/work_file.rb +92 -0
- data/lib/newspaper_works/data/work_files.rb +181 -0
- data/lib/newspaper_works/data.rb +35 -0
- data/lib/newspaper_works/engine.rb +42 -0
- data/lib/newspaper_works/errors.rb +14 -0
- data/lib/newspaper_works/ingest/base_ingest.rb +69 -0
- data/lib/newspaper_works/ingest/base_publication_info.rb +35 -0
- data/lib/newspaper_works/ingest/batch_ingest_helper.rb +44 -0
- data/lib/newspaper_works/ingest/batch_issue_ingester.rb +129 -0
- data/lib/newspaper_works/ingest/chronam_publication_info.rb +133 -0
- data/lib/newspaper_works/ingest/from_command.rb +52 -0
- data/lib/newspaper_works/ingest/image_ingest_issues.rb +43 -0
- data/lib/newspaper_works/ingest/issue_images.rb +51 -0
- data/lib/newspaper_works/ingest/lc_publication_info.rb +144 -0
- data/lib/newspaper_works/ingest/named_issue_metadata.rb +60 -0
- data/lib/newspaper_works/ingest/ndnp/batch_ingester.rb +64 -0
- data/lib/newspaper_works/ingest/ndnp/batch_xml_ingest.rb +72 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingest.rb +99 -0
- data/lib/newspaper_works/ingest/ndnp/container_ingester.rb +84 -0
- data/lib/newspaper_works/ingest/ndnp/container_metadata.rb +87 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingest.rb +81 -0
- data/lib/newspaper_works/ingest/ndnp/issue_ingester.rb +101 -0
- data/lib/newspaper_works/ingest/ndnp/issue_metadata.rb +96 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_asset_helper.rb +20 -0
- data/lib/newspaper_works/ingest/ndnp/ndnp_mets_helper.rb +70 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingest.rb +47 -0
- data/lib/newspaper_works/ingest/ndnp/page_ingester.rb +157 -0
- data/lib/newspaper_works/ingest/ndnp/page_metadata.rb +112 -0
- data/lib/newspaper_works/ingest/ndnp.rb +21 -0
- data/lib/newspaper_works/ingest/newspaper_issue_ingest.rb +56 -0
- data/lib/newspaper_works/ingest/newspaper_page_ingest.rb +6 -0
- data/lib/newspaper_works/ingest/page_image.rb +52 -0
- data/lib/newspaper_works/ingest/path_enumeration.rb +52 -0
- data/lib/newspaper_works/ingest/pdf_images.rb +85 -0
- data/lib/newspaper_works/ingest/pdf_issue.rb +20 -0
- data/lib/newspaper_works/ingest/pdf_issues.rb +39 -0
- data/lib/newspaper_works/ingest/pdf_pages.rb +114 -0
- data/lib/newspaper_works/ingest/pub_finder.rb +89 -0
- data/lib/newspaper_works/ingest/publication_info.rb +44 -0
- data/lib/newspaper_works/ingest.rb +90 -0
- data/lib/newspaper_works/issue_pdf_composer.rb +111 -0
- data/lib/newspaper_works/logging.rb +54 -0
- data/lib/newspaper_works/page_finder.rb +62 -0
- data/lib/newspaper_works/resource_fetcher.rb +78 -0
- data/lib/newspaper_works/text_extraction/alto_reader.rb +122 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +100 -0
- data/lib/newspaper_works/text_extraction/render_alto.rb +84 -0
- data/lib/newspaper_works/text_extraction/word_coords_builder.rb +30 -0
- data/lib/newspaper_works/text_extraction.rb +10 -0
- data/lib/newspaper_works/version.rb +3 -0
- data/lib/newspaper_works.rb +19 -0
- data/lib/tasks/newspaper_works_tasks.rake +39 -0
- data/newspaper_works.gemspec +49 -0
- data/spec/.keep.txt +1 -0
- data/spec/actors/newspaper_works/actors/newspaper_works_upload_actor_spec.rb +69 -0
- data/spec/controllers/catalog_controller_spec.rb +63 -0
- data/spec/controllers/newspaper_works/newspapers_controller_spec.rb +114 -0
- data/spec/controllers/newspaper_works/newspapers_search_controller_spec.rb +21 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_issue_ingest.rb +6 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_ingest.rb +6 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/features/front_pages_for_title_spec.rb +19 -0
- data/spec/features/newspaper_title_search_spec.rb +30 -0
- data/spec/features/newspapers_search_spec.rb +49 -0
- data/spec/features/search_results_thumbnail_highlights_spec.rb +33 -0
- data/spec/features_shared.rb +71 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/http404-expected +0 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf +1028 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn93059126.rdf +36 -0
- data/spec/fixtures/files/resource_mocks/chronam/sn94051019.rdf +37 -0
- data/spec/fixtures/files/resource_mocks/geonames/Chicopee +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Denver +1104 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/Marysville2 +279 -0
- data/spec/fixtures/files/resource_mocks/geonames/SLC +1104 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn2099999999 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn82014496 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83020109 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83021453 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn83045396 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn84038814 +2 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn93059126 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn94051019 +1 -0
- data/spec/fixtures/files/resource_mocks/lccn/sn99999999 +1 -0
- data/spec/fixtures/files/resource_mocks/urls.json +82 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/forms/hyrax/newspaper_article_form_spec.rb +33 -0
- data/spec/forms/hyrax/newspaper_container_form_spec.rb +30 -0
- data/spec/forms/hyrax/newspaper_issue_form_spec.rb +31 -0
- data/spec/forms/hyrax/newspaper_page_form_spec.rb +28 -0
- data/spec/forms/hyrax/newspaper_title_form_spec.rb +31 -0
- data/spec/forms/newspaper_works/newspaper_core_form_data_spec.rb +12 -0
- data/spec/helpers/newspaper_works/breadcrumb_helper_spec.rb +82 -0
- data/spec/helpers/newspaper_works_helper_spec.rb +57 -0
- data/spec/indexers/concerns/newspaper_works/indexes_full_text_spec.rb +31 -0
- data/spec/indexers/concerns/newspaper_works/indexes_place_of_publication_spec.rb +53 -0
- data/spec/indexers/concerns/newspaper_works/indexes_publication_date_range_spec.rb +39 -0
- data/spec/indexers/concerns/newspaper_works/indexes_relationships_spec.rb +86 -0
- data/spec/indexers/newspaper_article_indexer_spec.rb +29 -0
- data/spec/indexers/newspaper_issue_indexer_spec.rb +19 -0
- data/spec/indexers/newspaper_title_indexer_spec.rb +22 -0
- data/spec/indexers/newspaper_works/newspaper_core_indexer_spec.rb +23 -0
- data/spec/lib/newspaper_works/configuration_spec.rb +18 -0
- data/spec/lib/newspaper_works/data/work_derivatives_spec.rb +245 -0
- data/spec/lib/newspaper_works/data/work_file_spec.rb +99 -0
- data/spec/lib/newspaper_works/data/work_files_spec.rb +224 -0
- data/spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb +158 -0
- data/spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb +35 -0
- data/spec/lib/newspaper_works/ingest/from_command_spec.rb +75 -0
- data/spec/lib/newspaper_works/ingest/image_ingest_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +75 -0
- data/spec/lib/newspaper_works/ingest/issue_images_spec.rb +65 -0
- data/spec/lib/newspaper_works/ingest/lc_publication_info_spec.rb +34 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_ingester_spec.rb +131 -0
- data/spec/lib/newspaper_works/ingest/ndnp/batch_xml_ingest_spec.rb +64 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingest_spec.rb +44 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_ingester_spec.rb +126 -0
- data/spec/lib/newspaper_works/ingest/ndnp/container_metadata_spec.rb +36 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingest_spec.rb +108 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_ingester_spec.rb +155 -0
- data/spec/lib/newspaper_works/ingest/ndnp/issue_metadata_spec.rb +84 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingest_spec.rb +79 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_ingester_spec.rb +184 -0
- data/spec/lib/newspaper_works/ingest/ndnp/page_metadata_spec.rb +85 -0
- data/spec/lib/newspaper_works/ingest/newspaper_issue_ingest_spec.rb +83 -0
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +77 -0
- data/spec/lib/newspaper_works/ingest/page_image_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_images_spec.rb +32 -0
- data/spec/lib/newspaper_works/ingest/pdf_issue_spec.rb +29 -0
- data/spec/lib/newspaper_works/ingest/pdf_issues_spec.rb +62 -0
- data/spec/lib/newspaper_works/ingest/pdf_pages_spec.rb +110 -0
- data/spec/lib/newspaper_works/ingest/pub_finder_spec.rb +58 -0
- data/spec/lib/newspaper_works/ingest/publication_info_spec.rb +61 -0
- data/spec/lib/newspaper_works/ingest_spec.rb +45 -0
- data/spec/lib/newspaper_works/issue_pdf_composer_spec.rb +101 -0
- data/spec/lib/newspaper_works/logging_spec.rb +53 -0
- data/spec/lib/newspaper_works/page_finder_spec.rb +53 -0
- data/spec/lib/newspaper_works/resource_fetcher_spec.rb +65 -0
- data/spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +54 -0
- data/spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb +30 -0
- data/spec/lib/tasks/newspaper_works_rake_spec.rb +124 -0
- data/spec/misc_shared.rb +109 -0
- data/spec/model_shared.rb +134 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/annotation_behavior_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/blacklight_iiif_search/search_behavior_spec.rb +27 -0
- data/spec/models/concerns/newspaper_works/newspaper_core_metadata_spec.rb +45 -0
- data/spec/models/concerns/newspaper_works/place_of_publication_behavior_spec.rb +17 -0
- data/spec/models/concerns/newspaper_works/scanned_media_metadata_spec.rb +35 -0
- data/spec/models/newspaper_article_spec.rb +73 -0
- data/spec/models/newspaper_container_spec.rb +111 -0
- data/spec/models/newspaper_issue_spec.rb +91 -0
- data/spec/models/newspaper_page_spec.rb +44 -0
- data/spec/models/newspaper_title_spec.rb +116 -0
- data/spec/models/newspaper_works/derivative_attachment_spec.rb +37 -0
- data/spec/models/newspaper_works/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/ndnp_shared.rb +48 -0
- data/spec/presenters/hyrax/newspaper_article_presenter_spec.rb +53 -0
- data/spec/presenters/hyrax/newspaper_container_presenter_spec.rb +20 -0
- data/spec/presenters/hyrax/newspaper_issue_presenter_spec.rb +65 -0
- data/spec/presenters/hyrax/newspaper_page_presenter_spec.rb +75 -0
- data/spec/presenters/hyrax/newspaper_title_presenter_spec.rb +153 -0
- data/spec/presenters/newspaper_works/iiif_manifest_presenter_behavior_spec.rb +32 -0
- data/spec/presenters/newspaper_works/issue_info_presenter_spec.rb +51 -0
- data/spec/presenters/newspaper_works/newspaper_core_presenter_spec.rb +22 -0
- data/spec/presenters/newspaper_works/persistent_url_presenter_behavior_spec.rb +24 -0
- data/spec/presenters/newspaper_works/place_of_publication_presenter_behavior_spec.rb +17 -0
- data/spec/presenters/newspaper_works/scanned_media_presenter_spec.rb +18 -0
- data/spec/presenters/newspaper_works/title_info_presenter_spec.rb +23 -0
- data/spec/routing/route_spec.rb +52 -0
- data/spec/search_builders/custom_search_builder_spec.rb +34 -0
- data/spec/search_builders/newspaper_works/newspapers_search_builder_spec.rb +33 -0
- data/spec/services/hyrax/article_genre_service_spec.rb +12 -0
- data/spec/services/hyrax/resource_types_service_spec.rb +12 -0
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +125 -0
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +62 -0
- data/spec/services/newspaper_works/pluggable_derivative_service_spec.rb +204 -0
- data/spec/services/newspaper_works/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/newspaper_works/text_formats_from_alto_service_spec.rb +129 -0
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +58 -0
- data/spec/spec_helper.rb +261 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +22 -0
- data/spec/views/catalog/_index_gallery_newspaper_page_wrapper.html.erb_spec.rb +36 -0
- data/spec/views/catalog/_index_header_list_newspaper_page.html.erb_spec.rb +26 -0
- data/spec/views/catalog/_thumbnail_list_newspaper_page.html.erb_spec.rb +35 -0
- data/spec/views/hyrax/newspaper_titles/_all_front_pages_form.html.erb_spec.rb +16 -0
- data/spec/views/hyrax/newspaper_titles/_issue_search_form.html.erb_spec.rb +33 -0
- data/spec/views/hyrax/newspaper_titles/_issues_calendar.html.erb_spec.rb +37 -0
- data/spec/views/hyrax/newspaper_titles/show.html.erb_spec.rb +87 -0
- data/spec/views/newspaper_works/base/_attribute_rows.html.erb_spec.rb +60 -0
- data/spec/views/newspaper_works/base/_newspaper_hierarchy.html.erb_spec.rb +80 -0
- data/spec/views/newspaper_works/base/_show.html.erb_spec.rb +78 -0
- data/spec/views/newspaper_works/newspapers_search/search.html.erb_spec.rb +54 -0
- data/spec/views/records/edit_fields/_place_of_publication.html.erb_spec.rb +26 -0
- data/tasks/newspaperworks_dev.rake +26 -0
- data/test/integration/navigation_test.rb +7 -0
- data/test/lib/generators/newspaper_works/install_generator_test.rb +16 -0
- data/test/newspaper_works_test.rb +7 -0
- data/test/test_helper.rb +17 -0
- data/tmp/.keep +0 -0
- metadata +1037 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
module NewspaperWorks
  # Mixin with logging helpers for consuming classes. Messages are written
  # through Rails.logger and prefixed with the name of the class that
  # produced them (see #message_format).
  module Logging
    class << self
      # Names that have already been passed to #configure_logger; used to
      # avoid attaching the same named log file more than once.
      attr_accessor :configured
    end
    self.configured = []

    # @return [Logger] the current Rails.logger (re-read on every call)
    def logger
      @logger = Rails.logger
    end

    # Log message, as in standard logger, but use message_format on message.
    # As with Logger#add, the message may be supplied lazily by a block when
    # msg is nil; the block is only evaluated if the severity is logged.
    # @param severity [Integer] log level/severity, e.g. Logger::INFO == 1
    # @param msg [String, nil] Log message to be formatted by message_format
    # @param progname [String] (optional)
    # @yieldreturn [String] message, used only when msg is nil
    def log(severity, msg, progname = nil, &block)
      if msg.nil? && block
        # Keep the block lazy: Logger#add yields only when msg is nil and
        # the severity passes the logger's level check.
        logger.add(severity, nil, progname) { message_format(block.call) }
      else
        logger.add(severity, message_format(msg), progname)
      end
    end

    # Simpler alternative to .log, with default severity, message_format
    #   wrapping.
    # @param msg [String] Log message to be formatted by message_format
    # @param severity [Integer] log level/severity, e.g. Logger::INFO == 1
    # @param progname [String] (optional)
    def write_log(msg, severity = Logger::INFO, progname = nil)
      log(severity, msg, progname)
    end

    # Format message, distinct from per-output formatting, to be used in
    #   all logging channels Rails.logger broadcasts to. This wrapping
    #   indicates in parenthetical prefix which class is acting to
    #   produce message.
    # @param msg [String]
    # @return [String] message prefixed with "(ClassName) "
    def message_format(msg)
      "(#{self.class}) #{msg}"
    end

    # Should be called by the consuming class prior to use of the .logger
    #   method. Has a check to prevent duplicate configuration if the given
    #   name has already been configured.
    # @param name [String] base name of the log file, log/<name>.log
    def configure_logger(name)
      @logger = Rails.logger
      return if NewspaperWorks::Logging.configured.include?(name)
      path = Rails.root.join("log/#{name}.log")
      @named_log = ActiveSupport::Logger.new(path)
      @named_log.formatter = proc do |_severity, datetime, _progname, msg|
        "#{datetime}: #{msg}\n"
      end
      # rails will log to named_log in addition to any other configured
      #   or default logging destinations:
      @logger.extend(ActiveSupport::Logger.broadcast(@named_log))
      NewspaperWorks::Logging.configured.push(name)
    end
  end
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# useful methods for retrieving and ordering NewspaperPage objects
|
|
2
|
+
module NewspaperWorks
  # Mixin with helpers for retrieving NewspaperPage Solr documents and
  # putting them in reading order.
  module PageFinder
    ##
    # find all pages for an issue, return in order
    # @param issue_id [String]
    # @return [Array] ordered NewspaperPage SolrDocuments for an issue;
    #   empty Array when the query yields no documents
    def pages_for_issue(issue_id)
      solr_params = ["has_model_ssim:\"NewspaperPage\""]
      solr_params << "issue_id_ssi:\"#{issue_id}\""
      # NOTE(review): no `rows` parameter is passed, so the number of pages
      # returned is presumably bounded by the index's default row limit;
      # confirm against the Blacklight/Solr configuration.
      solr_resp = Blacklight.default_index.search(fq: solr_params.join(' AND '))
      all_pages = solr_resp.documents
      return [] if all_pages.blank?
      ordered_pages(all_pages)
    end

    ##
    # return an ordered array of NewspaperPage documents
    #
    # Pages form a linked list: the first page has no
    # 'is_following_page_of_ssi', the last page has no
    # 'is_preceding_page_of_ssi', and each page's 'is_preceding_page_of_ssi'
    # holds the id of the page that comes after it.
    #
    # If the chain cannot be followed (no first page, a link to a document
    # that is not in +documents+, or a cycle), the documents are returned
    # unchanged rather than raising or looping forever.
    #
    # @param documents [Array] NewspaperPage SolrDocuments for an issue
    # @return [Array] ordered NewspaperPage SolrDocuments for an issue
    def ordered_pages(documents)
      return documents if documents.length <= 1
      ordered_list = []
      next_page_id = nil
      final_page_id = nil
      documents.each do |doc|
        if doc['is_following_page_of_ssi'].blank?
          # first page: goes to the front, and seeds the chain walk
          ordered_list.insert(0, doc)
          next_page_id = doc['is_preceding_page_of_ssi']
        elsif doc['is_preceding_page_of_ssi'].blank?
          # last page: goes to the back, and marks where the walk stops
          ordered_list.insert(-1, doc)
          final_page_id = doc['id']
        end
      end
      return documents if next_page_id.nil?

      # index once by id (first occurrence wins) instead of rescanning the
      # whole list for every link in the chain
      by_id = documents.each_with_object({}) { |doc, memo| memo[doc['id']] ||= doc }
      visited = {}
      while next_page_id != final_page_id
        next_page = by_id[next_page_id]
        # broken link or cycle: ordering is not recoverable, fall back
        return documents if next_page.nil? || visited[next_page_id]
        visited[next_page_id] = true
        # keep the final page (if any) at the tail
        ordered_list.insert(-2, next_page)
        next_page_id = next_page['is_preceding_page_of_ssi']
      end
      ordered_list
    end

    ##
    # return the index of the current page
    # @param page_id [String] id of the NewspaperPage
    # @param issue_id [String] id of the parent NewspaperIssue; looked up
    #   from the page's Solr document when omitted
    # @return [Integer] the page's index; 0 when the page or its issue
    #   cannot be resolved, or the page is not among the issue's pages
    def get_page_index(page_id, issue_id = nil)
      default_index = 0
      unless issue_id
        page_doc = SolrDocument.find(page_id)
        # a page that follows nothing is the first page, hence index 0
        return default_index unless page_doc &&
                                    page_doc['issue_id_ssi'] &&
                                    page_doc['is_following_page_of_ssi']
        issue_id = page_doc['issue_id_ssi']
      end
      all_pages = pages_for_issue(issue_id)
      return default_index if all_pages.blank?
      # Array#index yields nil when nothing matches; honor the Integer contract
      all_pages.index { |page| page['id'] == page_id } || default_index
    end
  end
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
module NewspaperWorks
  # in-memory caching fetcher for HTTP GET requests, wraps Faraday.get
  #
  # The cache is a Hash held on the class (shared by all instances), keyed
  # by URL. Each cached value is the response headers Hash with 'status',
  # 'body' and 'cached_time' (epoch seconds) keys added.
  class ResourceFetcher
    # only cache following HTTP response codes, per Section 6.1, RFC 7231
    CACHEABLE_STATUS = [
      200, 203, 204, 206, 300, 301, 404, 405, 410, 414, 501
    ].freeze

    class << self
      attr_accessor :cache
    end

    # Convenience: GET a URL through a throw-away fetcher instance.
    # @param url [String] URL to GET
    # @param stale_after [Integer] seconds after which a cached entry expires
    # @return [Hash] hash containing status, response headers, response body
    def self.get(url, stale_after = 3600)
      new(stale_after).get(url)
    end

    # @param url [String]
    # @return [true, false] whether the shared cache currently has an entry
    #   for url (expiry is not checked here)
    def self.include?(url)
      return false if cache.nil?
      cache.key?(url)
    end

    # @param stale_after [Integer] seconds after which a cached entry expires
    def initialize(stale_after = 3600)
      @stale_after = stale_after # seconds
      # initialize shared state only if missing:
      self.class.cache = {} if self.class.cache.nil?
    end

    # GET a URL, served from cache when a fresh entry exists.
    # @param url [String] URL to GET
    # @return [Hash] hash containing status, response headers, response body
    def get(url)
      cache_get(url) || miss_get(url)
    end

    # @return [Hash] shared cache state
    def cache
      self.class.cache
    end

    # @return [NilClass, Hash] hash of status, response body — or nil if no HIT
    def cache_get(url)
      # evicts the entry when stale, so a stale HIT turns into a miss below
      check_expiry(url)
      cache[url] if cache.key?(url)
    end

    # Get URL from original source, by URL; will cache any cachable response
    #   in self.class.cache (shared state).
    # @param url [String] URL to GET
    # @raise [Faraday::ConnectionFailed] if DNS or TCP connection error.
    # @return [Hash] hash containing status, response headers, response body
    def miss_get(url)
      resp = Faraday.get url
      # start from the response headers, then add status and body
      result = resp.headers.to_h
      result['status'] = resp.status
      result['body'] = resp.body
      # set (new or replaced previously) cached value for URL:
      if CACHEABLE_STATUS.include?(resp.status)
        result['cached_time'] = Time.now.to_i
        cache[url] = result
      end
      result
    end

    # Remove the cached entry for url if it has outlived @stale_after.
    # @param url [String]
    def check_expiry(url)
      return unless cache.key?(url)
      cache.delete(url) if expired(cache[url])
    end

    # @param record [Hash] cached entry, must carry 'cached_time'
    # @return [true, false] does elapsed time since caching exceed threshold?
    def expired(record)
      (Time.now.to_i - record['cached_time']) > @stale_after
    end
  end
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
require 'active_support/core_ext/module/delegation'
|
|
2
|
+
require 'json'
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
|
|
5
|
+
module NewspaperWorks
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from ALTO source
    class AltoReader
      attr_accessor :source, :doc_stream
      delegate :text, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from ALTO
      class AltoDocStream < Nokogiri::XML::SAX::Document
        attr_accessor :text, :words

        # @param image_width [Integer, nil] pixel width of the page image;
        #   when given, ALTO coordinates are rescaled to it.
        def initialize(image_width = nil)
          super()
          # scaling matters:
          @image_width = image_width
          @scaling = 1.0 # pt to px, if ALTO using points
          # plain text buffer:
          @text = ''
          # list of word hash, containing word+coord:
          @words = []
        end

        # Return coordinates from String element attribute hash
        #
        # @param attrs [Hash] hash containing ALTO `String` element attributes.
        # @return [Array] Array of position x, y, width, height in px.
        #   Missing attributes count as 0.
        def s_coords(attrs)
          height = scale_value((attrs['HEIGHT'] || 0).to_i)
          width = scale_value((attrs['WIDTH'] || 0).to_i)
          hpos = scale_value((attrs['HPOS'] || 0).to_i)
          vpos = scale_value((attrs['VPOS'] || 0).to_i)
          [hpos, vpos, width, height]
        end

        # Derive the ALTO-units-per-pixel ratio from the Page element's WIDTH
        # attribute and the known image width. Leaves scaling at 1.0 when the
        # image width is unknown, the attribute is absent, or the page width
        # is not a positive number (which would otherwise make scale_value
        # divide by zero).
        #
        # @param attrs [Array] Array of key, value pair Arrays.
        def compute_scaling(attrs)
          return if @image_width.nil?
          match = attrs.find { |key, _value| key.casecmp?('WIDTH') }
          return if match.nil?
          page_width = match[1].to_i
          return if page_width <= 0 || @image_width == page_width
          @scaling = page_width / @image_width.to_f
        end

        # @param v [Integer] value in ALTO units
        # @return [Integer] value in pixels (truncated)
        def scale_value(v)
          (v / @scaling).to_i
        end

        # Callback for element start, implementation of which ignores
        # non-String elements.
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          compute_scaling(attrs) if name == 'Page'
          return if name != 'String'
          values = attrs.to_h
          token = values['CONTENT']
          # a String element without CONTENT carries no word to record
          return if token.nil?
          @text << token
          @words << {
            word: token,
            coordinates: s_coords(values)
          }
        end

        # Callback for element end, used here to manage endings of lines and
        # blocks.
        #
        # @param name [String] element name.
        def end_element(name)
          @text << " " if name == 'String'
          @text << "\n" if name == 'TextBlock'
          @text << "\n" if name == 'TextLine'
        end

        # Callback for completion of parsing ALTO, used to normalize generated
        # text content (strip unneeded whitespace incidental to output).
        def end_document
          # postprocess @text to remove trailing spaces on lines
          @text = @text.split("\n").map(&:strip).join("\n")
          # remove trailing whitespace at end of buffer
          @text.strip!
        end
      end

      # Construct from either ALTO XML source or a path to an ALTO file,
      # and parse the document immediately.
      #
      # @param xml [String] ALTO XML source, or path to an ALTO XML file
      # @param image_width [Integer, nil] pixel width of the page image
      # @param image_height [Integer, nil] pixel height of the page image
      def initialize(xml, image_width = nil, image_height = nil)
        @source = isxml?(xml) ? xml : File.read(xml)
        @image_width = image_width
        @image_height = image_height
        @doc_stream = AltoDocStream.new(image_width)
        parser = Nokogiri::XML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if string appears to be XML source, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        words = @doc_stream.words
        builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(words,
                                                                        @image_width,
                                                                        @image_height)
        builder.to_json
      end
    end
  end
end
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
require 'json'
|
|
2
|
+
require 'open3'
|
|
3
|
+
require 'rtesseract'
|
|
4
|
+
|
|
5
|
+
# --
|
|
6
|
+
module NewspaperWorks
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Runs OCR over a single page image via RTesseract and exposes the
    # result as plain text, word-coordinate JSON, or ALTO XML. Image
    # dimensions come from the `identify` command line tool (the
    # GraphicsMagick variant is used for JP2 sources).
    class PageOCR
      # @param path [String] path to the page image
      # @return [String] ALTO XML for the OCR result of the image at path
      def self.alto_from(path)
        new(path).alto
      end

      # @param path [String] path to the page image
      def initialize(path)
        @path = path
        @words = nil
        @processor = "mini_magick"
        @source_meta = nil
        @source_geometry = nil
        @use_gm = extension.start_with?('jp2')
        @box = nil
        @plain = nil
      end

      # @return [String] lower-cased file extension without the leading dot;
      #   empty String when the file name has no extension
      def extension
        File.extname(@path.to_s).sub(/\A\./, '').downcase
      end

      # Run OCR once and memoize both the box result and its plain text.
      # @return [RTesseract::Box]
      def load_box
        return @box unless @box.nil?
        if @use_gm
          MiniMagick.with_cli(:graphicsmagick) { run_ocr }
        else
          run_ocr
        end
        @box
      end

      # @return [Array<Hash>] word hashes as reported by RTesseract::Box#words
      def words
        @words = load_box.words if @words.nil?
        @words
      end

      # Convert a start/end box into x, y, width, height form.
      # @param word [Hash] with :word, :x_start, :y_start, :x_end, :y_end
      # @return [Hash] with :word and :coordinates ([x, y, width, height])
      def normalized_coordinate(word)
        {
          word: word[:word],
          coordinates: [
            word[:x_start],
            word[:y_start],
            (word[:x_end] - word[:x_start]),
            (word[:y_end] - word[:y_start])
          ]
        }
      end

      # @return [String] JSON serialization of flattened word coordinates
      def word_json
        save_words = words.map { |w| normalized_coordinate(w) }
        builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(save_words,
                                                                        width,
                                                                        height)
        builder.to_json
      end

      # @return [String] plain text of the OCR result
      def plain
        load_box
        @plain
      end

      # Ask `identify -verbose` (or `gm identify -verbose`) for the image
      # geometry; memoized. The command is run without a shell, so paths
      # containing spaces or shell metacharacters are passed through intact.
      #
      # @raise [RuntimeError] if the tool output has no Geometry line
      # @return [Array<Integer>] [width, height] in pixels
      def identify
        return @source_geometry unless @source_geometry.nil?
        cmd = ['identify', '-verbose', @path.to_s]
        cmd.unshift('gm') if @use_gm
        output, _status = Open3.capture2(*cmd)
        geo = output.lines.find { |line| line.strip.start_with?('Geometry') }
        raise "Unable to determine image geometry for #{@path}" if geo.nil?
        # line looks like "Geometry: 1234x5678+0+0"; keep only "1234x5678"
        img_geo = geo.strip.split(':')[-1].strip.split('+')[0]
        @source_geometry = img_geo.split('x').map(&:to_i)
      end

      # @return [Integer] image width in pixels
      def width
        identify[0]
      end

      # @return [Integer] image height in pixels
      def height
        identify[1]
      end

      # @return [String] ALTO XML for the OCR result
      def alto
        writer = NewspaperWorks::TextExtraction::RenderAlto.new(width, height)
        writer.to_alto(words)
      end

      private

      # Perform the OCR call and capture its plain-text rendering.
      def run_ocr
        @box = RTesseract::Box.new(@path, processor: @processor)
        @plain = @box.to_s
      end
    end
  end
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
|
|
3
|
+
module NewspaperWorks
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Writes a list of OCR word boxes out as an ALTO XML document holding
    # one page, one text block and one text line.
    class RenderAlto
      # @param width [Numeric] page width in pixels
      # @param height [Numeric] page height in pixels
      # @param scaling [Numeric] factor applied to every word coordinate
      def initialize(width, height, scaling = 1.0)
        @width = width
        @height = height
        @scaling = scaling
      end

      # @param words [Array<Hash>] word hashes with :word, :x_start,
      #   :y_start, :x_end and :y_end
      # @return [String] ALTO XML
      def to_alto(words)
        document = alto_page(@width, @height) do |xml|
          words.each do |word|
            left = word[:x_start]
            top = word[:y_start]
            xml.String(
              CONTENT: word[:word],
              HEIGHT: scale_point(word[:y_end] - top).to_s,
              WIDTH: scale_point(word[:x_end] - left).to_s,
              HPOS: scale_point(left).to_s,
              VPOS: scale_point(top).to_s
            ) { xml.text '' }
          end
        end
        document.to_xml
      end

      private

      # build the document shell; the given block emits the words
      def alto_page(pxwidth, pxheight, &block)
        Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
          xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
            xml.Description { xml.MeasurementUnit 'pixel' }
            alto_layout(xml, pxwidth, pxheight, &block)
          end
        end
      end

      # integer output only: ALTO 2.1 allows xsd:float coordinates, but
      # values are truncated to whole numbers here
      def scale_point(value)
        scaled = value * @scaling
        scaled.to_i
      end

      # Layout > Page > PrintSpace, each spanning the full page
      def alto_layout(xml, pxwidth, pxheight, &block)
        page_height = pxheight.to_i
        page_width = pxwidth.to_i
        xml.Layout do
          xml.Page(ID: 'ID1', PHYSICAL_IMG_NR: '1', HEIGHT: page_height, WIDTH: page_width) do
            xml.PrintSpace(HEIGHT: page_height, WIDTH: page_width, HPOS: '0', VPOS: '0') do
              alto_blockline(xml, pxwidth, pxheight, &block)
            end
          end
        end
      end

      # single full-page TextBlock > TextLine; yields the builder so the
      # caller can add the String elements
      def alto_blockline(xml, pxwidth, pxheight)
        full_page = { HEIGHT: pxheight.to_i, WIDTH: pxwidth.to_i, HPOS: '0', VPOS: '0' }
        xml.TextBlock({ ID: 'ID1a' }.merge(full_page)) do
          xml.TextLine(full_page) do
            yield(xml)
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# this file calls JSON.generate; require it here rather than rely on load order
require 'json'

module NewspaperWorks
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Groups word boxes by word and serializes them, with the page
    # dimensions, as JSON.
    class WordCoordsBuilder
      # @param words [Array<Hash>] hashes with :word and :coordinates keys
      # @param width [Integer, nil] page width, emitted as "width"
      # @param height [Integer, nil] page height, emitted as "height"
      def initialize(words, width = nil, height = nil)
        @words = words
        @width = width
        @height = height
      end

      # Output JSON flattened word coordinates
      #
      # Accepts (and ignores) the generator-state argument that JSON passes
      # to #to_json when this object is nested inside another structure
      # being serialized.
      #
      # @return [String] JSON serialization of flattened word coordinates,
      #   shaped as {"width":…, "height":…, "coords": {word => [coords, …]}}
      def to_json(*_args)
        # every occurrence of a word contributes one coordinates entry
        coordinates = @words.each_with_object({}) do |w, memo|
          (memo[w[:word]] ||= []) << w[:coordinates]
        end
        payload = { width: @width, height: @height, coords: coordinates }
        JSON.generate(payload)
      end
    end
  end
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
require 'newspaper_works/text_extraction/alto_reader'
|
|
2
|
+
require 'newspaper_works/text_extraction/page_ocr'
|
|
3
|
+
require 'newspaper_works/text_extraction/render_alto'
|
|
4
|
+
require 'newspaper_works/text_extraction/word_coords_builder'
|
|
5
|
+
|
|
6
|
+
module NewspaperWorks
|
|
7
|
+
# Module for text extraction (OCR or otherwise)
|
|
8
|
+
module TextExtraction
|
|
9
|
+
end
|
|
10
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require "newspaper_works/engine"
|
|
2
|
+
require "newspaper_works/errors"
|
|
3
|
+
require "newspaper_works/ingest"
|
|
4
|
+
require "newspaper_works/issue_pdf_composer"
|
|
5
|
+
require "newspaper_works/text_extraction"
|
|
6
|
+
require "newspaper_works/data"
|
|
7
|
+
require "newspaper_works/configuration"
|
|
8
|
+
require "newspaper_works/page_finder"
|
|
9
|
+
require "newspaper_works/logging"
|
|
10
|
+
require "newspaper_works/resource_fetcher"
|
|
11
|
+
|
|
12
|
+
# Newspaper works modules
|
|
13
|
+
module NewspaperWorks
  # Lazily built, memoized configuration object for the engine.
  # When a block is given it is yielded the configuration, so callers can
  # write `NewspaperWorks.config { |c| ... }`.
  # @return [NewspaperWorks::Configuration]
  def self.config
    @config ||= NewspaperWorks::Configuration.new
    yield @config if block_given?
    @config
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
namespace :newspaper_works do
  # Load the Rails application environment for a task, defaulting
  # RAILS_ENV to the current Rails.env when it is not set.
  def use_application
    ENV['RAILS_ENV'] ||= Rails.env
    Rails.application.require_environment!
  end

  desc 'Ingest an NDNP batch: "rake newspaper_works:ingest_ndnp -- --path="'
  task :ingest_ndnp do
    use_application
    # options are read from ARGV, after the given command prefix
    command_name = 'rake newspaper_works:ingest_ndnp --'
    ingester = NewspaperWorks::Ingest::NDNP::BatchIngester.from_command(ARGV, command_name)
    puts "Beginning NDNP batch ingest..."
    ingester.ingest
    puts "NDNP batch ingest complete! See log/ingest.log for details."
  end

  desc 'Ingest a directory of PDF issues for a single publication: '\
       '"rake newspaper_works:ingest_pdf_issues -- --path="'
  task :ingest_issues do
    use_application
    command_name = 'rake newspaper_works:ingest_issues --'
    ingester = NewspaperWorks::Ingest::BatchIssueIngester.from_command(ARGV, command_name)
    puts "Beginning batch ingest of issues for single publication..."
    ingester.ingest
    puts "Ingest of issue(s) ingest complete, but may be pending background "\
         "jobs. See log/ingest.log for details."
  end

  # Aliases to media-specific task ingest names: each is an empty task
  # whose only prerequisite is :ingest_issues
  %i[ingest_pdf_issues ingest_tiff_issues ingest_jp2_issues].each do |alias_name|
    task alias_name => :ingest_issues
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
$LOAD_PATH.push File.expand_path('../lib', __FILE__)
|
|
2
|
+
|
|
3
|
+
# version updated in one place:
|
|
4
|
+
require 'newspaper_works/version'
|
|
5
|
+
|
|
6
|
+
# Gem description:
|
|
7
|
+
Gem::Specification.new do |spec|
  spec.name        = 'newspaper_works'
  spec.version     = NewspaperWorks::VERSION
  spec.authors     = ['Sean Upton', 'Jacob Reed', 'Brian McBride',
                      'Eben English']
  spec.email       = ['sean.upton@utah.edu', 'jacob.reed@utah.edu',
                      'brian.mcbride@utah.edu', 'eenglish@bpl.org']
  spec.homepage    = 'https://github.com/marriott-library/newspaper_works'
  # kept on one logical line: a literal line break inside the quotes would
  # put the newline and the indentation into the published description
  spec.description = 'Gem/Engine for Newspaper Works in Hyrax-based Samvera ' \
                     'Application.'
  spec.summary     = <<-SUMMARY
  newspaper_works is a Rails Engine gem providing model and administrative
  functions to Hyrax-based Samvera applications, for management of
  (primarily scanned) archival newspaper content.
  SUMMARY
  spec.license     = 'Apache-2.0'
  # NUL-delimited listing: splitting on $OUTPUT_RECORD_SEPARATOR (nil by
  # default) splits on any whitespace and mangles file names with spaces
  spec.files       = `git ls-files -z`.split("\x0")
  spec.test_files  = spec.files.grep(%r{^(test|spec|features)/})

  # runtime dependencies
  spec.add_dependency 'blacklight_iiif_search'
  spec.add_dependency 'blacklight_advanced_search', '6.4.1'
  spec.add_dependency 'hyrax', '2.5.1'
  spec.add_dependency 'nokogiri'
  spec.add_dependency 'rails', '~> 5.1'
  spec.add_dependency 'rtesseract', '~> 2.2.0'
  spec.add_dependency 'sass-rails', '~> 5.0'

  # development / test dependencies
  spec.add_development_dependency 'bixby'
  spec.add_development_dependency 'capybara', '~> 2.4', '< 2.18.0'
  spec.add_development_dependency 'chromedriver-helper', '~> 2.1'
  spec.add_development_dependency 'engine_cart', '~> 2.2'
  spec.add_development_dependency "factory_bot", '~> 4.4'
  spec.add_development_dependency "faraday"
  spec.add_development_dependency 'fcrepo_wrapper', '~> 0.5', '>= 0.5.1'
  spec.add_development_dependency 'newspaper_works_fixtures', '~> 0.3', '>= 0.3.1'
  spec.add_development_dependency 'rails-controller-testing', '~> 1'
  spec.add_development_dependency 'rspec-rails', '~> 3.1'
  spec.add_development_dependency 'rspec-activemodel-mocks'
  spec.add_development_dependency 'selenium-webdriver'
  spec.add_development_dependency 'shoulda-matchers', '~> 3.1'
  spec.add_development_dependency 'solr_wrapper', '>= 1.1', '< 3.0'
  spec.add_development_dependency 'webdrivers', '~> 3.0'
  spec.add_development_dependency 'webmock', '~> 3.6'
end
|
data/spec/.keep.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
spec dir for RSpec
|