curation_concerns-models 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/Rakefile +1 -1
  4. data/app/actors/concerns/curation_concerns/manages_embargoes_actor.rb +11 -19
  5. data/app/actors/curation_concerns/base_actor.rb +41 -45
  6. data/app/actors/curation_concerns/embargo_actor.rb +19 -0
  7. data/app/actors/curation_concerns/file_set_actor.rb +200 -0
  8. data/app/actors/curation_concerns/lease_actor.rb +19 -0
  9. data/app/actors/curation_concerns/work_actor_behavior.rb +55 -58
  10. data/app/indexers/curation_concerns/collection_indexer.rb +10 -0
  11. data/app/indexers/curation_concerns/file_set_indexing_service.rb +24 -0
  12. data/app/{services/curation_concerns/generic_work_indexing_service.rb → indexers/curation_concerns/work_indexing_service.rb} +6 -6
  13. data/app/jobs/active_fedora_id_based_job.rb +5 -12
  14. data/app/jobs/audit_job.rb +11 -17
  15. data/app/jobs/characterize_job.rb +8 -7
  16. data/app/jobs/create_derivatives_job.rb +8 -11
  17. data/app/jobs/import_url_job.rb +12 -25
  18. data/app/jobs/ingest_file_job.rb +16 -0
  19. data/app/jobs/ingest_local_file_job.rb +14 -35
  20. data/app/jobs/resolrize_job.rb +3 -5
  21. data/app/jobs/upload_set_update_job.rb +68 -0
  22. data/app/models/checksum_audit_log.rb +2 -3
  23. data/app/models/concerns/curation_concerns/ability.rb +18 -10
  24. data/app/models/concerns/curation_concerns/basic_metadata.rb +1 -3
  25. data/app/models/concerns/curation_concerns/collection_behavior.rb +13 -14
  26. data/app/models/concerns/curation_concerns/file_set/belongs_to_upload_sets.rb +15 -0
  27. data/app/models/concerns/curation_concerns/{generic_file → file_set}/belongs_to_works.rb +8 -14
  28. data/app/models/concerns/curation_concerns/file_set/derivatives.rb +54 -0
  29. data/app/models/concerns/curation_concerns/{generic_file → file_set}/full_text_indexing.rb +1 -2
  30. data/app/models/concerns/curation_concerns/{generic_file → file_set}/indexing.rb +2 -2
  31. data/app/models/concerns/curation_concerns/{generic_file → file_set}/versions.rb +2 -3
  32. data/app/models/concerns/curation_concerns/file_set_behavior.rb +36 -0
  33. data/app/models/concerns/curation_concerns/generic_file.rb +1 -1
  34. data/app/models/concerns/curation_concerns/has_representative.rb +6 -7
  35. data/app/models/concerns/curation_concerns/human_readable_type.rb +5 -7
  36. data/app/models/concerns/curation_concerns/permissions.rb +2 -2
  37. data/app/models/concerns/curation_concerns/permissions/readable.rb +0 -1
  38. data/app/models/concerns/curation_concerns/permissions/writable.rb +10 -51
  39. data/app/models/concerns/curation_concerns/serializers.rb +3 -5
  40. data/app/models/concerns/curation_concerns/solr_document_behavior.rb +37 -40
  41. data/app/models/concerns/curation_concerns/upload_set_behavior.rb +38 -0
  42. data/app/models/concerns/curation_concerns/user.rb +4 -51
  43. data/app/models/concerns/curation_concerns/with_file_sets.rb +28 -0
  44. data/app/models/concerns/curation_concerns/{generic_work_behavior.rb → work_behavior.rb} +12 -6
  45. data/app/models/curation_concerns/classify_concern.rb +7 -7
  46. data/app/models/curation_concerns/quick_classification_query.rb +6 -7
  47. data/app/models/single_use_link.rb +34 -0
  48. data/app/models/upload_set.rb +3 -0
  49. data/app/services/curation_concerns/derivative_path.rb +32 -0
  50. data/app/services/curation_concerns/{generic_file_audit_service.rb → file_set_audit_service.rb} +17 -18
  51. data/app/services/curation_concerns/indexes_thumbnails.rb +14 -0
  52. data/app/services/curation_concerns/local_file_service.rb +10 -0
  53. data/app/services/curation_concerns/lock_manager.rb +40 -0
  54. data/app/services/curation_concerns/noid.rb +1 -1
  55. data/app/services/curation_concerns/persist_derivatives.rb +33 -0
  56. data/app/services/curation_concerns/persist_directly_contained_output_file_service.rb +26 -0
  57. data/app/services/curation_concerns/repository_audit_service.rb +1 -3
  58. data/app/services/curation_concerns/thumbnail_path_service.rb +46 -0
  59. data/app/services/curation_concerns/time_service.rb +7 -0
  60. data/app/services/curation_concerns/versioning_service.rb +11 -12
  61. data/curation_concerns-models.gemspec +6 -6
  62. data/lib/curation_concerns/configuration.rb +154 -0
  63. data/lib/curation_concerns/messages.rb +26 -26
  64. data/lib/curation_concerns/models.rb +5 -14
  65. data/lib/curation_concerns/models/engine.rb +0 -30
  66. data/lib/curation_concerns/models/utils.rb +4 -4
  67. data/lib/curation_concerns/models/version.rb +1 -1
  68. data/lib/generators/curation_concerns/models/abstract_migration_generator.rb +8 -7
  69. data/lib/generators/curation_concerns/models/clamav_generator.rb +3 -3
  70. data/lib/generators/curation_concerns/models/install_generator.rb +13 -20
  71. data/lib/generators/curation_concerns/models/templates/app/models/file_set.rb +4 -0
  72. data/lib/generators/curation_concerns/models/templates/config/clamav.rb +1 -1
  73. data/lib/generators/curation_concerns/models/templates/config/curation_concerns.rb +52 -65
  74. data/lib/generators/curation_concerns/models/templates/config/redis_config.rb +13 -17
  75. data/lib/generators/curation_concerns/models/templates/config/resque_config.rb +2 -1
  76. data/lib/generators/curation_concerns/models/templates/migrations/create_checksum_audit_logs.rb +3 -3
  77. data/lib/generators/curation_concerns/models/templates/migrations/create_single_use_links.rb +12 -0
  78. data/lib/tasks/curation_concerns-models_tasks.rake +4 -62
  79. data/lib/tasks/migrate.rake +1 -1
  80. data/lib/tasks/resque.rake +1 -0
  81. data/lib/tasks/solr_reindex.rake +1 -1
  82. metadata +59 -52
  83. data/app/actors/curation_concerns/generic_file_actor.rb +0 -150
  84. data/app/jobs/active_fedora_pid_based_job.rb +0 -6
  85. data/app/jobs/copy_permissions_job.rb +0 -24
  86. data/app/models/concerns/curation_concerns/generic_file/characterization.rb +0 -89
  87. data/app/models/concerns/curation_concerns/generic_file/content.rb +0 -8
  88. data/app/models/concerns/curation_concerns/generic_file/export.rb +0 -343
  89. data/app/models/concerns/curation_concerns/generic_file_behavior.rb +0 -44
  90. data/app/models/concerns/curation_concerns/with_basic_metadata.rb +0 -98
  91. data/app/models/concerns/curation_concerns/with_generic_files.rb +0 -29
  92. data/app/models/datastreams/fits_datastream.rb +0 -148
  93. data/app/services/curation_concerns/characterization_service.rb +0 -71
  94. data/app/services/curation_concerns/full_text_extraction_service.rb +0 -38
  95. data/app/services/curation_concerns/generic_file_indexing_service.rb +0 -14
  96. data/lib/curation_concerns/models/resque.rb +0 -36
  97. data/lib/generators/curation_concerns/models/fulltext_generator.rb +0 -28
  98. data/lib/generators/curation_concerns/models/templates/app/models/generic_file.rb +0 -4
  99. data/lib/generators/curation_concerns/models/templates/config/resque_admin.rb +0 -10
@@ -1,44 +0,0 @@
1
- module CurationConcerns
2
- module GenericFileBehavior
3
- extend ActiveSupport::Concern
4
- include Hydra::Works::GenericFileBehavior
5
- include Hydra::Works::GenericFile::VirusCheck
6
- include Hydra::WithDepositor
7
- include CurationConcerns::Serializers
8
- include CurationConcerns::Noid
9
- include CurationConcerns::Permissions
10
- include CurationConcerns::GenericFile::Export
11
- include CurationConcerns::GenericFile::Characterization
12
- include CurationConcerns::BasicMetadata
13
- include CurationConcerns::GenericFile::Content
14
- include CurationConcerns::GenericFile::FullTextIndexing
15
- include CurationConcerns::GenericFile::Indexing
16
- include CurationConcerns::GenericFile::BelongsToWorks
17
- include Hydra::AccessControls::Embargoable
18
-
19
- included do
20
- attr_accessor :file
21
-
22
- # make filename single-value (CurationConcerns::GenericFile::Characterization makes it multivalue)
23
- def filename
24
- self[:filename].first
25
- end
26
- end
27
-
28
- def human_readable_type
29
- self.class.to_s.demodulize.titleize
30
- end
31
-
32
- def representative
33
- to_param
34
- end
35
-
36
- def to_solr(solr_doc = {})
37
- super(solr_doc).tap do |solr_doc|
38
- # Enables Riiif to not have to recalculate this each time.
39
- solr_doc['height_isi'] = Integer(height.first) if height.present?
40
- solr_doc['width_isi'] = Integer(width.first) if width.present?
41
- end
42
- end
43
- end
44
- end
@@ -1,98 +0,0 @@
1
- module CurationConcerns
2
- # This is a direct copy of Sufia::GenericFile::Metadata with a few modifications:
3
- # * title & description are single-value instead of multivalue
4
- module DefaultMetadata
5
- extend ActiveSupport::Concern
6
-
7
- included do
8
-
9
- property :label, predicate: ::RDF::DC.title, multiple: false
10
-
11
- property :depositor, predicate: ::RDF::URI.new("http://id.loc.gov/vocabulary/relators/dpt"), multiple: false do |index|
12
- index.as :symbol, :stored_searchable
13
- end
14
-
15
- property :relative_path, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#relativePath'), multiple: false
16
-
17
- property :import_url, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#importUrl'), multiple: false do |index|
18
- index.as :symbol
19
- end
20
-
21
- property :part_of, predicate: ::RDF::DC.isPartOf
22
- property :resource_type, predicate: ::RDF::DC.type do |index|
23
- index.as :stored_searchable, :facetable
24
- end
25
- property :title, predicate: ::RDF::DC.title, multiple:false do |index|
26
- index.as :stored_searchable, :facetable
27
- end
28
- property :creator, predicate: ::RDF::DC.creator do |index|
29
- index.as :stored_searchable, :facetable
30
- end
31
- property :contributor, predicate: ::RDF::DC.contributor do |index|
32
- index.as :stored_searchable, :facetable
33
- end
34
- property :description, predicate: ::RDF::DC.description, multiple: false do |index|
35
- index.type :text
36
- index.as :stored_searchable
37
- end
38
- property :tag, predicate: ::RDF::DC.relation do |index|
39
- index.as :stored_searchable, :facetable
40
- end
41
- property :rights, predicate: ::RDF::DC.rights do |index|
42
- index.as :stored_searchable
43
- end
44
- property :publisher, predicate: ::RDF::DC.publisher do |index|
45
- index.as :stored_searchable, :facetable
46
- end
47
- property :date_created, predicate: ::RDF::DC.created do |index|
48
- index.as :stored_searchable
49
- end
50
- property :date_uploaded, predicate: ::RDF::DC.dateSubmitted, multiple: false do |index|
51
- index.type :date
52
- index.as :stored_sortable
53
- end
54
- property :date_modified, predicate: ::RDF::DC.modified, multiple: false do |index|
55
- index.type :date
56
- index.as :stored_sortable
57
- end
58
- property :subject, predicate: ::RDF::DC.subject do |index|
59
- index.as :stored_searchable, :facetable
60
- end
61
- property :language, predicate: ::RDF::DC.language do |index|
62
- index.as :stored_searchable, :facetable
63
- end
64
- property :identifier, predicate: ::RDF::DC.identifier do |index|
65
- index.as :stored_searchable
66
- end
67
- property :based_near, predicate: ::RDF::FOAF.based_near do |index|
68
- index.as :stored_searchable, :facetable
69
- end
70
- property :related_url, predicate: ::RDF::RDFS.seeAlso do |index|
71
- index.as :stored_searchable
72
- end
73
- property :bibliographic_citation, predicate: ::RDF::DC.bibliographicCitation do |index|
74
- index.as :stored_searchable
75
- end
76
- property :source, predicate: ::RDF::DC.source do |index|
77
- index.as :stored_searchable
78
- end
79
-
80
- # TODO: Move this somewhere more appropriate
81
- begin
82
- LocalAuthority.register_vocabulary(self, "subject", "lc_subjects")
83
- LocalAuthority.register_vocabulary(self, "language", "lexvo_languages")
84
- LocalAuthority.register_vocabulary(self, "tag", "lc_genres")
85
- rescue
86
- puts "tables for vocabularies missing"
87
- end
88
- end
89
-
90
- # Add a schema.org itemtype
91
- def itemtype
92
- # Look up the first non-empty resource type value in a hash from the config
93
- CurationConcerns.config.resource_types_to_schema[resource_type.to_a.reject { |type| type.empty? }.first] || 'http://schema.org/CreativeWork'
94
- rescue
95
- 'http://schema.org/CreativeWork'
96
- end
97
- end
98
- end
@@ -1,29 +0,0 @@
1
- # Copied from Curate
2
- module CurationConcerns
3
- module WithGenericFiles
4
- extend ActiveSupport::Concern
5
-
6
- included do
7
- # The generic_files association and its accessor methods comes from Hydra::Works::AggregatesGenericFiles
8
- before_destroy :before_destroy_cleanup_generic_files
9
- end
10
-
11
- # Stopgap unil ActiveFedora ContainerAssociation includes an *_ids accessor.
12
- # At the moment, this is no more efficient than calling generic_files, but hopefully that will change in the future.
13
- def generic_file_ids
14
- generic_files.map { |generic_file| generic_file.id }
15
- end
16
-
17
- def before_destroy_cleanup_generic_files
18
- generic_files.each(&:destroy)
19
- end
20
-
21
- def copy_visibility_to_files
22
- generic_files.each do |gf|
23
- gf.visibility = visibility
24
- gf.save!
25
- end
26
- end
27
-
28
- end
29
- end
@@ -1,148 +0,0 @@
1
- class FitsDatastream < ActiveFedora::OmDatastream
2
- include OM::XML::Document
3
-
4
- set_terminology do |t|
5
- t.root(path: "fits",
6
- xmlns: "http://hul.harvard.edu/ois/xml/ns/fits/fits_output",
7
- schema: "http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd")
8
- t.identification {
9
- t.identity {
10
- t.format_label(path: {attribute: "format"})
11
- t.mime_type(path: {attribute: "mimetype"})
12
- }
13
- }
14
- t.fileinfo {
15
- t.file_size(path: "size")
16
- t.last_modified(path: "lastmodified")
17
- t.filename(path: "filename")
18
- t.original_checksum(path: "md5checksum")
19
- t.rights_basis(path: "rightsBasis")
20
- t.copyright_basis(path: "copyrightBasis")
21
- t.copyright_note(path: "copyrightNote")
22
- }
23
- t.filestatus {
24
- t.well_formed(path: "well-formed")
25
- t.valid(path: "valid")
26
- t.status_message(path: "message")
27
- }
28
- t.metadata {
29
- t.document {
30
- t.file_title(path: "title")
31
- t.file_author(path: "author")
32
- t.file_language(path: "language")
33
- t.page_count(path: "pageCount")
34
- t.word_count(path: "wordCount")
35
- t.character_count(path: "characterCount")
36
- t.paragraph_count(path: "paragraphCount")
37
- t.line_count(path: "lineCount")
38
- t.table_count(path: "tableCount")
39
- t.graphics_count(path: "graphicsCount")
40
- }
41
- t.image {
42
- t.byte_order(path: "byteOrder")
43
- t.compression(path: "compressionScheme")
44
- t.width(path: "imageWidth")
45
- t.height(path: "imageHeight")
46
- t.color_space(path: "colorSpace")
47
- t.profile_name(path: "iccProfileName")
48
- t.profile_version(path: "iccProfileVersion")
49
- t.orientation(path: "orientation")
50
- t.color_map(path: "colorMap")
51
- t.image_producer(path: "imageProducer")
52
- t.capture_device(path: "captureDevice")
53
- t.scanning_software(path: "scanningSoftwareName")
54
- t.exif_version(path: "exifVersion")
55
- t.gps_timestamp(path: "gpsTimeStamp")
56
- t.latitude(path: "gpsDestLatitude")
57
- t.longitude(path: "gpsDestLongitude")
58
- }
59
- t.text {
60
- t.character_set(path: "charset")
61
- t.markup_basis(path: "markupBasis")
62
- t.markup_language(path: "markupLanguage")
63
- }
64
- t.audio {
65
- t.duration(path: "duration")
66
- t.bit_depth(path: "bitDepth")
67
- t.sample_rate(path: "sampleRate")
68
- t.channels(path: "channels")
69
- t.data_format(path: "dataFormatType")
70
- t.offset(path: "offset")
71
- }
72
- t.video {
73
- t.width(path: "imageWidth")
74
- t.height(path: "imageHeight")
75
- t.duration(path: "duration")
76
- t.sample_rate(path: "sampleRate")
77
- t.frame_rate(path: "frameRate")
78
- }
79
- }
80
- t.format_label(proxy: [:identification, :identity, :format_label])
81
- t.mime_type(proxy: [:identification, :identity, :mime_type])
82
- t.file_size(proxy: [:fileinfo, :file_size])
83
- t.last_modified(proxy: [:fileinfo, :last_modified])
84
- t.filename(proxy: [:fileinfo, :filename])
85
- t.original_checksum(proxy: [:fileinfo, :original_checksum])
86
- t.rights_basis(proxy: [:fileinfo, :rights_basis])
87
- t.copyright_basis(proxy: [:fileinfo, :copyright_basis])
88
- t.copyright_note(proxy: [:fileinfo, :copyright_note])
89
- t.well_formed(proxy: [:filestatus, :well_formed])
90
- t.valid(proxy: [:filestatus, :valid])
91
- t.status_message(proxy: [:filestatus, :status_message])
92
- t.file_title(proxy: [:metadata, :document, :file_title])
93
- t.file_author(proxy: [:metadata, :document, :file_author])
94
- t.page_count(proxy: [:metadata, :document, :page_count])
95
- t.file_language(proxy: [:metadata, :document, :file_language])
96
- t.word_count(proxy: [:metadata, :document, :word_count])
97
- t.character_count(proxy: [:metadata, :document, :character_count])
98
- t.paragraph_count(proxy: [:metadata, :document, :paragraph_count])
99
- t.line_count(proxy: [:metadata, :document, :line_count])
100
- t.table_count(proxy: [:metadata, :document, :table_count])
101
- t.graphics_count(proxy: [:metadata, :document, :graphics_count])
102
- t.byte_order(proxy: [:metadata, :image, :byte_order])
103
- t.compression(proxy: [:metadata, :image, :compression])
104
- t.width(proxy: [:metadata, :image, :width])
105
- t.video_width( proxy: [:metadata, :video, :width])
106
- t.height(proxy: [:metadata, :image, :height])
107
- t.video_height(proxy: [:metadata, :video, :height])
108
- t.color_space(proxy: [:metadata, :image, :color_space])
109
- t.profile_name(proxy: [:metadata, :image, :profile_name])
110
- t.profile_version(proxy: [:metadata, :image, :profile_version])
111
- t.orientation(proxy: [:metadata, :image, :orientation])
112
- t.color_map(proxy: [:metadata, :image, :color_map])
113
- t.image_producer(proxy: [:metadata, :image, :image_producer])
114
- t.capture_device(proxy: [:metadata, :image, :capture_device])
115
- t.scanning_software(proxy: [:metadata, :image, :scanning_software])
116
- t.exif_version(proxy: [:metadata, :image, :exif_version])
117
- t.gps_timestamp(proxy: [:metadata, :image, :gps_timestamp])
118
- t.latitude(proxy: [:metadata, :image, :latitude])
119
- t.longitude(proxy: [:metadata, :image, :longitude])
120
- t.character_set(proxy: [:metadata, :text, :character_set])
121
- t.markup_basis(proxy: [:metadata, :text, :markup_basis])
122
- t.markup_language(proxy: [:metadata, :text, :markup_language])
123
- t.duration(proxy: [:metadata, :audio, :duration])
124
- t.video_duration(proxy: [:metadata, :video, :duration])
125
- t.bit_depth(proxy: [:metadata, :audio, :bit_depth])
126
- t.sample_rate(proxy: [:metadata, :audio, :sample_rate])
127
- t.video_sample_rate(proxy: [:metadata, :video, :sample_rate])
128
- t.channels(proxy: [:metadata, :audio, :channels])
129
- t.data_format(proxy: [:metadata, :audio, :data_format])
130
- t.offset(proxy: [:metadata, :audio, :offset])
131
- t.frame_rate(proxy: [:metadata, :video, :frame_rate])
132
- end
133
-
134
- def self.xml_template
135
- builder = Nokogiri::XML::Builder.new do |xml|
136
- xml.fits(xmlns: 'http://hul.harvard.edu/ois/xml/ns/fits/fits_output',
137
- 'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
138
- 'xsi:schemaLocation' =>
139
- "http://hul.harvard.edu/ois/xml/ns/fits/fits_output
140
- http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd",
141
- version: "0.6.0",
142
- timestamp: "1/25/12 11:04 AM") {
143
- xml.identification { xml.identity(toolname: 'FITS') }
144
- }
145
- end
146
- builder.doc
147
- end
148
- end
@@ -1,71 +0,0 @@
1
- module CurationConcerns
2
- # Run FITS to gather technical metadata about the content and the full text.
3
- # Store this extracted metadata in the characterization datastream.
4
- class CharacterizationService
5
- include Hydra::Derivatives::ExtractMetadata
6
-
7
- delegate :mime_type, :uri, to: :@generic_file
8
- attr_reader :generic_file
9
-
10
- def self.run(generic_file)
11
- new(generic_file).characterize
12
- end
13
-
14
- def initialize(generic_file)
15
- @generic_file = generic_file
16
- end
17
-
18
- ## Extract the metadata from the content datastream and record it in the characterization datastream
19
- def characterize
20
- store_metadata(extract_metadata)
21
- store_fulltext(extract_fulltext)
22
- generic_file.filename = [generic_file.original_file.original_name]
23
- end
24
-
25
- protected
26
-
27
- def store_fulltext(extracted_text)
28
- if extracted_text.present?
29
- extracted_text_file = generic_file.build_extracted_text
30
- extracted_text_file.content = extracted_text
31
- end
32
- end
33
-
34
- def extract_fulltext
35
- FullTextExtractionService.run(generic_file)
36
- end
37
-
38
- def store_metadata(metadata)
39
- generic_file.characterization.ng_xml = metadata if metadata.present?
40
- append_metadata
41
- end
42
-
43
- def extract_metadata
44
- return unless generic_file.original_file.has_content?
45
- Hydra::FileCharacterization.characterize(generic_file.original_file.content, filename_for_characterization.join, :fits) do |config|
46
- config[:fits] = Hydra::Derivatives.fits_path
47
- end
48
- end
49
-
50
- # Populate GenericFile's properties with fields from FITS (e.g. Author from pdfs)
51
- def append_metadata
52
- terms = generic_file.characterization_terms
53
- CurationConcerns.config.fits_to_desc_mapping.each_pair do |k, v|
54
- if terms.has_key?(k)
55
- # coerce to array to remove a conditional
56
- terms[k] = [terms[k]] unless terms[k].is_a? Array
57
- terms[k].each do |term_value|
58
- proxy_term = generic_file.send(v)
59
- if proxy_term.kind_of?(Array)
60
- proxy_term << term_value unless proxy_term.include?(term_value)
61
- else
62
- # these are single-valued terms which cannot be appended to
63
- generic_file.send("#{v}=", term_value)
64
- end
65
- end
66
- end
67
- end
68
- end
69
-
70
- end
71
- end
@@ -1,38 +0,0 @@
1
- module CurationConcerns
2
- # Extract the full text from the content using Solr's extract handler
3
- class FullTextExtractionService
4
- def self.run(generic_file)
5
- new(generic_file).extract
6
- end
7
-
8
- delegate :original_file, :logger, :mime_type, :id, to: :@generic_file
9
-
10
- def initialize(generic_file)
11
- @generic_file = generic_file
12
- end
13
-
14
- def extract
15
- uri = URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
16
- req = Net::HTTP.new(uri.host, uri.port)
17
- resp = req.post(uri.to_s, original_file.content, {
18
- 'Content-type' => "#{mime_type};charset=utf-8",
19
- 'Content-Length' => original_file.content.size.to_s
20
- })
21
- raise "URL '#{uri}' returned code #{resp.code}" unless resp.code == "200"
22
- original_file.content.rewind if original_file.content.respond_to?(:rewind)
23
- JSON.parse(resp.body)[''].rstrip
24
- rescue => e
25
- logger.error("Error extracting content from #{id}: #{e.inspect}")
26
- return nil
27
- end
28
-
29
- def connection_url
30
- case
31
- when Blacklight.connection_config[:url] then Blacklight.connection_config[:url]
32
- when Blacklight.connection_config["url"] then Blacklight.connection_config["url"]
33
- when Blacklight.connection_config[:fulltext] then Blacklight.connection_config[:fulltext]["url"]
34
- else Blacklight.connection_config[:default]["url"]
35
- end
36
- end
37
- end
38
- end
@@ -1,14 +0,0 @@
1
- module CurationConcerns
2
- class GenericFileIndexingService < ActiveFedora::IndexingService
3
- def generate_solr_document
4
- super.tap do |solr_doc|
5
- solr_doc[Solrizer.solr_name('label')] = object.label
6
- solr_doc[Solrizer.solr_name('file_format')] = object.file_format
7
- solr_doc[Solrizer.solr_name('file_format', :facetable)] = object.file_format
8
- solr_doc[Solrizer.solr_name(:file_size, :symbol)] = object.file_size[0]
9
- solr_doc['all_text_timv'] = object.full_text.content
10
- solr_doc[Solrizer.solr_name('generic_work_ids', :symbol)] = object.generic_work_ids unless object.generic_work_ids.empty?
11
- end
12
- end
13
- end
14
- end