curation_concerns-models 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/Rakefile +1 -1
  4. data/app/actors/concerns/curation_concerns/manages_embargoes_actor.rb +11 -19
  5. data/app/actors/curation_concerns/base_actor.rb +41 -45
  6. data/app/actors/curation_concerns/embargo_actor.rb +19 -0
  7. data/app/actors/curation_concerns/file_set_actor.rb +200 -0
  8. data/app/actors/curation_concerns/lease_actor.rb +19 -0
  9. data/app/actors/curation_concerns/work_actor_behavior.rb +55 -58
  10. data/app/indexers/curation_concerns/collection_indexer.rb +10 -0
  11. data/app/indexers/curation_concerns/file_set_indexing_service.rb +24 -0
  12. data/app/{services/curation_concerns/generic_work_indexing_service.rb → indexers/curation_concerns/work_indexing_service.rb} +6 -6
  13. data/app/jobs/active_fedora_id_based_job.rb +5 -12
  14. data/app/jobs/audit_job.rb +11 -17
  15. data/app/jobs/characterize_job.rb +8 -7
  16. data/app/jobs/create_derivatives_job.rb +8 -11
  17. data/app/jobs/import_url_job.rb +12 -25
  18. data/app/jobs/ingest_file_job.rb +16 -0
  19. data/app/jobs/ingest_local_file_job.rb +14 -35
  20. data/app/jobs/resolrize_job.rb +3 -5
  21. data/app/jobs/upload_set_update_job.rb +68 -0
  22. data/app/models/checksum_audit_log.rb +2 -3
  23. data/app/models/concerns/curation_concerns/ability.rb +18 -10
  24. data/app/models/concerns/curation_concerns/basic_metadata.rb +1 -3
  25. data/app/models/concerns/curation_concerns/collection_behavior.rb +13 -14
  26. data/app/models/concerns/curation_concerns/file_set/belongs_to_upload_sets.rb +15 -0
  27. data/app/models/concerns/curation_concerns/{generic_file → file_set}/belongs_to_works.rb +8 -14
  28. data/app/models/concerns/curation_concerns/file_set/derivatives.rb +54 -0
  29. data/app/models/concerns/curation_concerns/{generic_file → file_set}/full_text_indexing.rb +1 -2
  30. data/app/models/concerns/curation_concerns/{generic_file → file_set}/indexing.rb +2 -2
  31. data/app/models/concerns/curation_concerns/{generic_file → file_set}/versions.rb +2 -3
  32. data/app/models/concerns/curation_concerns/file_set_behavior.rb +36 -0
  33. data/app/models/concerns/curation_concerns/generic_file.rb +1 -1
  34. data/app/models/concerns/curation_concerns/has_representative.rb +6 -7
  35. data/app/models/concerns/curation_concerns/human_readable_type.rb +5 -7
  36. data/app/models/concerns/curation_concerns/permissions.rb +2 -2
  37. data/app/models/concerns/curation_concerns/permissions/readable.rb +0 -1
  38. data/app/models/concerns/curation_concerns/permissions/writable.rb +10 -51
  39. data/app/models/concerns/curation_concerns/serializers.rb +3 -5
  40. data/app/models/concerns/curation_concerns/solr_document_behavior.rb +37 -40
  41. data/app/models/concerns/curation_concerns/upload_set_behavior.rb +38 -0
  42. data/app/models/concerns/curation_concerns/user.rb +4 -51
  43. data/app/models/concerns/curation_concerns/with_file_sets.rb +28 -0
  44. data/app/models/concerns/curation_concerns/{generic_work_behavior.rb → work_behavior.rb} +12 -6
  45. data/app/models/curation_concerns/classify_concern.rb +7 -7
  46. data/app/models/curation_concerns/quick_classification_query.rb +6 -7
  47. data/app/models/single_use_link.rb +34 -0
  48. data/app/models/upload_set.rb +3 -0
  49. data/app/services/curation_concerns/derivative_path.rb +32 -0
  50. data/app/services/curation_concerns/{generic_file_audit_service.rb → file_set_audit_service.rb} +17 -18
  51. data/app/services/curation_concerns/indexes_thumbnails.rb +14 -0
  52. data/app/services/curation_concerns/local_file_service.rb +10 -0
  53. data/app/services/curation_concerns/lock_manager.rb +40 -0
  54. data/app/services/curation_concerns/noid.rb +1 -1
  55. data/app/services/curation_concerns/persist_derivatives.rb +33 -0
  56. data/app/services/curation_concerns/persist_directly_contained_output_file_service.rb +26 -0
  57. data/app/services/curation_concerns/repository_audit_service.rb +1 -3
  58. data/app/services/curation_concerns/thumbnail_path_service.rb +46 -0
  59. data/app/services/curation_concerns/time_service.rb +7 -0
  60. data/app/services/curation_concerns/versioning_service.rb +11 -12
  61. data/curation_concerns-models.gemspec +6 -6
  62. data/lib/curation_concerns/configuration.rb +154 -0
  63. data/lib/curation_concerns/messages.rb +26 -26
  64. data/lib/curation_concerns/models.rb +5 -14
  65. data/lib/curation_concerns/models/engine.rb +0 -30
  66. data/lib/curation_concerns/models/utils.rb +4 -4
  67. data/lib/curation_concerns/models/version.rb +1 -1
  68. data/lib/generators/curation_concerns/models/abstract_migration_generator.rb +8 -7
  69. data/lib/generators/curation_concerns/models/clamav_generator.rb +3 -3
  70. data/lib/generators/curation_concerns/models/install_generator.rb +13 -20
  71. data/lib/generators/curation_concerns/models/templates/app/models/file_set.rb +4 -0
  72. data/lib/generators/curation_concerns/models/templates/config/clamav.rb +1 -1
  73. data/lib/generators/curation_concerns/models/templates/config/curation_concerns.rb +52 -65
  74. data/lib/generators/curation_concerns/models/templates/config/redis_config.rb +13 -17
  75. data/lib/generators/curation_concerns/models/templates/config/resque_config.rb +2 -1
  76. data/lib/generators/curation_concerns/models/templates/migrations/create_checksum_audit_logs.rb +3 -3
  77. data/lib/generators/curation_concerns/models/templates/migrations/create_single_use_links.rb +12 -0
  78. data/lib/tasks/curation_concerns-models_tasks.rake +4 -62
  79. data/lib/tasks/migrate.rake +1 -1
  80. data/lib/tasks/resque.rake +1 -0
  81. data/lib/tasks/solr_reindex.rake +1 -1
  82. metadata +59 -52
  83. data/app/actors/curation_concerns/generic_file_actor.rb +0 -150
  84. data/app/jobs/active_fedora_pid_based_job.rb +0 -6
  85. data/app/jobs/copy_permissions_job.rb +0 -24
  86. data/app/models/concerns/curation_concerns/generic_file/characterization.rb +0 -89
  87. data/app/models/concerns/curation_concerns/generic_file/content.rb +0 -8
  88. data/app/models/concerns/curation_concerns/generic_file/export.rb +0 -343
  89. data/app/models/concerns/curation_concerns/generic_file_behavior.rb +0 -44
  90. data/app/models/concerns/curation_concerns/with_basic_metadata.rb +0 -98
  91. data/app/models/concerns/curation_concerns/with_generic_files.rb +0 -29
  92. data/app/models/datastreams/fits_datastream.rb +0 -148
  93. data/app/services/curation_concerns/characterization_service.rb +0 -71
  94. data/app/services/curation_concerns/full_text_extraction_service.rb +0 -38
  95. data/app/services/curation_concerns/generic_file_indexing_service.rb +0 -14
  96. data/lib/curation_concerns/models/resque.rb +0 -36
  97. data/lib/generators/curation_concerns/models/fulltext_generator.rb +0 -28
  98. data/lib/generators/curation_concerns/models/templates/app/models/generic_file.rb +0 -4
  99. data/lib/generators/curation_concerns/models/templates/config/resque_admin.rb +0 -10
@@ -1,44 +0,0 @@
1
- module CurationConcerns
2
- module GenericFileBehavior
3
- extend ActiveSupport::Concern
4
- include Hydra::Works::GenericFileBehavior
5
- include Hydra::Works::GenericFile::VirusCheck
6
- include Hydra::WithDepositor
7
- include CurationConcerns::Serializers
8
- include CurationConcerns::Noid
9
- include CurationConcerns::Permissions
10
- include CurationConcerns::GenericFile::Export
11
- include CurationConcerns::GenericFile::Characterization
12
- include CurationConcerns::BasicMetadata
13
- include CurationConcerns::GenericFile::Content
14
- include CurationConcerns::GenericFile::FullTextIndexing
15
- include CurationConcerns::GenericFile::Indexing
16
- include CurationConcerns::GenericFile::BelongsToWorks
17
- include Hydra::AccessControls::Embargoable
18
-
19
- included do
20
- attr_accessor :file
21
-
22
- # make filename single-value (CurationConcerns::GenericFile::Characterization makes it multivalue)
23
- def filename
24
- self[:filename].first
25
- end
26
- end
27
-
28
- def human_readable_type
29
- self.class.to_s.demodulize.titleize
30
- end
31
-
32
- def representative
33
- to_param
34
- end
35
-
36
- def to_solr(solr_doc = {})
37
- super(solr_doc).tap do |solr_doc|
38
- # Enables Riiif to not have to recalculate this each time.
39
- solr_doc['height_isi'] = Integer(height.first) if height.present?
40
- solr_doc['width_isi'] = Integer(width.first) if width.present?
41
- end
42
- end
43
- end
44
- end
@@ -1,98 +0,0 @@
1
- module CurationConcerns
2
- # This is a direct copy of Sufia::GenericFile::Metadata with a few modifications:
3
- # * title & description are single-value instead of multivalue
4
- module DefaultMetadata
5
- extend ActiveSupport::Concern
6
-
7
- included do
8
-
9
- property :label, predicate: ::RDF::DC.title, multiple: false
10
-
11
- property :depositor, predicate: ::RDF::URI.new("http://id.loc.gov/vocabulary/relators/dpt"), multiple: false do |index|
12
- index.as :symbol, :stored_searchable
13
- end
14
-
15
- property :relative_path, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#relativePath'), multiple: false
16
-
17
- property :import_url, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#importUrl'), multiple: false do |index|
18
- index.as :symbol
19
- end
20
-
21
- property :part_of, predicate: ::RDF::DC.isPartOf
22
- property :resource_type, predicate: ::RDF::DC.type do |index|
23
- index.as :stored_searchable, :facetable
24
- end
25
- property :title, predicate: ::RDF::DC.title, multiple:false do |index|
26
- index.as :stored_searchable, :facetable
27
- end
28
- property :creator, predicate: ::RDF::DC.creator do |index|
29
- index.as :stored_searchable, :facetable
30
- end
31
- property :contributor, predicate: ::RDF::DC.contributor do |index|
32
- index.as :stored_searchable, :facetable
33
- end
34
- property :description, predicate: ::RDF::DC.description, multiple: false do |index|
35
- index.type :text
36
- index.as :stored_searchable
37
- end
38
- property :tag, predicate: ::RDF::DC.relation do |index|
39
- index.as :stored_searchable, :facetable
40
- end
41
- property :rights, predicate: ::RDF::DC.rights do |index|
42
- index.as :stored_searchable
43
- end
44
- property :publisher, predicate: ::RDF::DC.publisher do |index|
45
- index.as :stored_searchable, :facetable
46
- end
47
- property :date_created, predicate: ::RDF::DC.created do |index|
48
- index.as :stored_searchable
49
- end
50
- property :date_uploaded, predicate: ::RDF::DC.dateSubmitted, multiple: false do |index|
51
- index.type :date
52
- index.as :stored_sortable
53
- end
54
- property :date_modified, predicate: ::RDF::DC.modified, multiple: false do |index|
55
- index.type :date
56
- index.as :stored_sortable
57
- end
58
- property :subject, predicate: ::RDF::DC.subject do |index|
59
- index.as :stored_searchable, :facetable
60
- end
61
- property :language, predicate: ::RDF::DC.language do |index|
62
- index.as :stored_searchable, :facetable
63
- end
64
- property :identifier, predicate: ::RDF::DC.identifier do |index|
65
- index.as :stored_searchable
66
- end
67
- property :based_near, predicate: ::RDF::FOAF.based_near do |index|
68
- index.as :stored_searchable, :facetable
69
- end
70
- property :related_url, predicate: ::RDF::RDFS.seeAlso do |index|
71
- index.as :stored_searchable
72
- end
73
- property :bibliographic_citation, predicate: ::RDF::DC.bibliographicCitation do |index|
74
- index.as :stored_searchable
75
- end
76
- property :source, predicate: ::RDF::DC.source do |index|
77
- index.as :stored_searchable
78
- end
79
-
80
- # TODO: Move this somewhere more appropriate
81
- begin
82
- LocalAuthority.register_vocabulary(self, "subject", "lc_subjects")
83
- LocalAuthority.register_vocabulary(self, "language", "lexvo_languages")
84
- LocalAuthority.register_vocabulary(self, "tag", "lc_genres")
85
- rescue
86
- puts "tables for vocabularies missing"
87
- end
88
- end
89
-
90
- # Add a schema.org itemtype
91
- def itemtype
92
- # Look up the first non-empty resource type value in a hash from the config
93
- CurationConcerns.config.resource_types_to_schema[resource_type.to_a.reject { |type| type.empty? }.first] || 'http://schema.org/CreativeWork'
94
- rescue
95
- 'http://schema.org/CreativeWork'
96
- end
97
- end
98
- end
@@ -1,29 +0,0 @@
1
- # Copied from Curate
2
- module CurationConcerns
3
- module WithGenericFiles
4
- extend ActiveSupport::Concern
5
-
6
- included do
7
- # The generic_files association and its accessor methods comes from Hydra::Works::AggregatesGenericFiles
8
- before_destroy :before_destroy_cleanup_generic_files
9
- end
10
-
11
- # Stopgap unil ActiveFedora ContainerAssociation includes an *_ids accessor.
12
- # At the moment, this is no more efficient than calling generic_files, but hopefully that will change in the future.
13
- def generic_file_ids
14
- generic_files.map { |generic_file| generic_file.id }
15
- end
16
-
17
- def before_destroy_cleanup_generic_files
18
- generic_files.each(&:destroy)
19
- end
20
-
21
- def copy_visibility_to_files
22
- generic_files.each do |gf|
23
- gf.visibility = visibility
24
- gf.save!
25
- end
26
- end
27
-
28
- end
29
- end
@@ -1,148 +0,0 @@
1
- class FitsDatastream < ActiveFedora::OmDatastream
2
- include OM::XML::Document
3
-
4
- set_terminology do |t|
5
- t.root(path: "fits",
6
- xmlns: "http://hul.harvard.edu/ois/xml/ns/fits/fits_output",
7
- schema: "http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd")
8
- t.identification {
9
- t.identity {
10
- t.format_label(path: {attribute: "format"})
11
- t.mime_type(path: {attribute: "mimetype"})
12
- }
13
- }
14
- t.fileinfo {
15
- t.file_size(path: "size")
16
- t.last_modified(path: "lastmodified")
17
- t.filename(path: "filename")
18
- t.original_checksum(path: "md5checksum")
19
- t.rights_basis(path: "rightsBasis")
20
- t.copyright_basis(path: "copyrightBasis")
21
- t.copyright_note(path: "copyrightNote")
22
- }
23
- t.filestatus {
24
- t.well_formed(path: "well-formed")
25
- t.valid(path: "valid")
26
- t.status_message(path: "message")
27
- }
28
- t.metadata {
29
- t.document {
30
- t.file_title(path: "title")
31
- t.file_author(path: "author")
32
- t.file_language(path: "language")
33
- t.page_count(path: "pageCount")
34
- t.word_count(path: "wordCount")
35
- t.character_count(path: "characterCount")
36
- t.paragraph_count(path: "paragraphCount")
37
- t.line_count(path: "lineCount")
38
- t.table_count(path: "tableCount")
39
- t.graphics_count(path: "graphicsCount")
40
- }
41
- t.image {
42
- t.byte_order(path: "byteOrder")
43
- t.compression(path: "compressionScheme")
44
- t.width(path: "imageWidth")
45
- t.height(path: "imageHeight")
46
- t.color_space(path: "colorSpace")
47
- t.profile_name(path: "iccProfileName")
48
- t.profile_version(path: "iccProfileVersion")
49
- t.orientation(path: "orientation")
50
- t.color_map(path: "colorMap")
51
- t.image_producer(path: "imageProducer")
52
- t.capture_device(path: "captureDevice")
53
- t.scanning_software(path: "scanningSoftwareName")
54
- t.exif_version(path: "exifVersion")
55
- t.gps_timestamp(path: "gpsTimeStamp")
56
- t.latitude(path: "gpsDestLatitude")
57
- t.longitude(path: "gpsDestLongitude")
58
- }
59
- t.text {
60
- t.character_set(path: "charset")
61
- t.markup_basis(path: "markupBasis")
62
- t.markup_language(path: "markupLanguage")
63
- }
64
- t.audio {
65
- t.duration(path: "duration")
66
- t.bit_depth(path: "bitDepth")
67
- t.sample_rate(path: "sampleRate")
68
- t.channels(path: "channels")
69
- t.data_format(path: "dataFormatType")
70
- t.offset(path: "offset")
71
- }
72
- t.video {
73
- t.width(path: "imageWidth")
74
- t.height(path: "imageHeight")
75
- t.duration(path: "duration")
76
- t.sample_rate(path: "sampleRate")
77
- t.frame_rate(path: "frameRate")
78
- }
79
- }
80
- t.format_label(proxy: [:identification, :identity, :format_label])
81
- t.mime_type(proxy: [:identification, :identity, :mime_type])
82
- t.file_size(proxy: [:fileinfo, :file_size])
83
- t.last_modified(proxy: [:fileinfo, :last_modified])
84
- t.filename(proxy: [:fileinfo, :filename])
85
- t.original_checksum(proxy: [:fileinfo, :original_checksum])
86
- t.rights_basis(proxy: [:fileinfo, :rights_basis])
87
- t.copyright_basis(proxy: [:fileinfo, :copyright_basis])
88
- t.copyright_note(proxy: [:fileinfo, :copyright_note])
89
- t.well_formed(proxy: [:filestatus, :well_formed])
90
- t.valid(proxy: [:filestatus, :valid])
91
- t.status_message(proxy: [:filestatus, :status_message])
92
- t.file_title(proxy: [:metadata, :document, :file_title])
93
- t.file_author(proxy: [:metadata, :document, :file_author])
94
- t.page_count(proxy: [:metadata, :document, :page_count])
95
- t.file_language(proxy: [:metadata, :document, :file_language])
96
- t.word_count(proxy: [:metadata, :document, :word_count])
97
- t.character_count(proxy: [:metadata, :document, :character_count])
98
- t.paragraph_count(proxy: [:metadata, :document, :paragraph_count])
99
- t.line_count(proxy: [:metadata, :document, :line_count])
100
- t.table_count(proxy: [:metadata, :document, :table_count])
101
- t.graphics_count(proxy: [:metadata, :document, :graphics_count])
102
- t.byte_order(proxy: [:metadata, :image, :byte_order])
103
- t.compression(proxy: [:metadata, :image, :compression])
104
- t.width(proxy: [:metadata, :image, :width])
105
- t.video_width( proxy: [:metadata, :video, :width])
106
- t.height(proxy: [:metadata, :image, :height])
107
- t.video_height(proxy: [:metadata, :video, :height])
108
- t.color_space(proxy: [:metadata, :image, :color_space])
109
- t.profile_name(proxy: [:metadata, :image, :profile_name])
110
- t.profile_version(proxy: [:metadata, :image, :profile_version])
111
- t.orientation(proxy: [:metadata, :image, :orientation])
112
- t.color_map(proxy: [:metadata, :image, :color_map])
113
- t.image_producer(proxy: [:metadata, :image, :image_producer])
114
- t.capture_device(proxy: [:metadata, :image, :capture_device])
115
- t.scanning_software(proxy: [:metadata, :image, :scanning_software])
116
- t.exif_version(proxy: [:metadata, :image, :exif_version])
117
- t.gps_timestamp(proxy: [:metadata, :image, :gps_timestamp])
118
- t.latitude(proxy: [:metadata, :image, :latitude])
119
- t.longitude(proxy: [:metadata, :image, :longitude])
120
- t.character_set(proxy: [:metadata, :text, :character_set])
121
- t.markup_basis(proxy: [:metadata, :text, :markup_basis])
122
- t.markup_language(proxy: [:metadata, :text, :markup_language])
123
- t.duration(proxy: [:metadata, :audio, :duration])
124
- t.video_duration(proxy: [:metadata, :video, :duration])
125
- t.bit_depth(proxy: [:metadata, :audio, :bit_depth])
126
- t.sample_rate(proxy: [:metadata, :audio, :sample_rate])
127
- t.video_sample_rate(proxy: [:metadata, :video, :sample_rate])
128
- t.channels(proxy: [:metadata, :audio, :channels])
129
- t.data_format(proxy: [:metadata, :audio, :data_format])
130
- t.offset(proxy: [:metadata, :audio, :offset])
131
- t.frame_rate(proxy: [:metadata, :video, :frame_rate])
132
- end
133
-
134
- def self.xml_template
135
- builder = Nokogiri::XML::Builder.new do |xml|
136
- xml.fits(xmlns: 'http://hul.harvard.edu/ois/xml/ns/fits/fits_output',
137
- 'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
138
- 'xsi:schemaLocation' =>
139
- "http://hul.harvard.edu/ois/xml/ns/fits/fits_output
140
- http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd",
141
- version: "0.6.0",
142
- timestamp: "1/25/12 11:04 AM") {
143
- xml.identification { xml.identity(toolname: 'FITS') }
144
- }
145
- end
146
- builder.doc
147
- end
148
- end
@@ -1,71 +0,0 @@
1
- module CurationConcerns
2
- # Run FITS to gather technical metadata about the content and the full text.
3
- # Store this extracted metadata in the characterization datastream.
4
- class CharacterizationService
5
- include Hydra::Derivatives::ExtractMetadata
6
-
7
- delegate :mime_type, :uri, to: :@generic_file
8
- attr_reader :generic_file
9
-
10
- def self.run(generic_file)
11
- new(generic_file).characterize
12
- end
13
-
14
- def initialize(generic_file)
15
- @generic_file = generic_file
16
- end
17
-
18
- ## Extract the metadata from the content datastream and record it in the characterization datastream
19
- def characterize
20
- store_metadata(extract_metadata)
21
- store_fulltext(extract_fulltext)
22
- generic_file.filename = [generic_file.original_file.original_name]
23
- end
24
-
25
- protected
26
-
27
- def store_fulltext(extracted_text)
28
- if extracted_text.present?
29
- extracted_text_file = generic_file.build_extracted_text
30
- extracted_text_file.content = extracted_text
31
- end
32
- end
33
-
34
- def extract_fulltext
35
- FullTextExtractionService.run(generic_file)
36
- end
37
-
38
- def store_metadata(metadata)
39
- generic_file.characterization.ng_xml = metadata if metadata.present?
40
- append_metadata
41
- end
42
-
43
- def extract_metadata
44
- return unless generic_file.original_file.has_content?
45
- Hydra::FileCharacterization.characterize(generic_file.original_file.content, filename_for_characterization.join, :fits) do |config|
46
- config[:fits] = Hydra::Derivatives.fits_path
47
- end
48
- end
49
-
50
- # Populate GenericFile's properties with fields from FITS (e.g. Author from pdfs)
51
- def append_metadata
52
- terms = generic_file.characterization_terms
53
- CurationConcerns.config.fits_to_desc_mapping.each_pair do |k, v|
54
- if terms.has_key?(k)
55
- # coerce to array to remove a conditional
56
- terms[k] = [terms[k]] unless terms[k].is_a? Array
57
- terms[k].each do |term_value|
58
- proxy_term = generic_file.send(v)
59
- if proxy_term.kind_of?(Array)
60
- proxy_term << term_value unless proxy_term.include?(term_value)
61
- else
62
- # these are single-valued terms which cannot be appended to
63
- generic_file.send("#{v}=", term_value)
64
- end
65
- end
66
- end
67
- end
68
- end
69
-
70
- end
71
- end
@@ -1,38 +0,0 @@
1
- module CurationConcerns
2
- # Extract the full text from the content using Solr's extract handler
3
- class FullTextExtractionService
4
- def self.run(generic_file)
5
- new(generic_file).extract
6
- end
7
-
8
- delegate :original_file, :logger, :mime_type, :id, to: :@generic_file
9
-
10
- def initialize(generic_file)
11
- @generic_file = generic_file
12
- end
13
-
14
- def extract
15
- uri = URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
16
- req = Net::HTTP.new(uri.host, uri.port)
17
- resp = req.post(uri.to_s, original_file.content, {
18
- 'Content-type' => "#{mime_type};charset=utf-8",
19
- 'Content-Length' => original_file.content.size.to_s
20
- })
21
- raise "URL '#{uri}' returned code #{resp.code}" unless resp.code == "200"
22
- original_file.content.rewind if original_file.content.respond_to?(:rewind)
23
- JSON.parse(resp.body)[''].rstrip
24
- rescue => e
25
- logger.error("Error extracting content from #{id}: #{e.inspect}")
26
- return nil
27
- end
28
-
29
- def connection_url
30
- case
31
- when Blacklight.connection_config[:url] then Blacklight.connection_config[:url]
32
- when Blacklight.connection_config["url"] then Blacklight.connection_config["url"]
33
- when Blacklight.connection_config[:fulltext] then Blacklight.connection_config[:fulltext]["url"]
34
- else Blacklight.connection_config[:default]["url"]
35
- end
36
- end
37
- end
38
- end
@@ -1,14 +0,0 @@
1
- module CurationConcerns
2
- class GenericFileIndexingService < ActiveFedora::IndexingService
3
- def generate_solr_document
4
- super.tap do |solr_doc|
5
- solr_doc[Solrizer.solr_name('label')] = object.label
6
- solr_doc[Solrizer.solr_name('file_format')] = object.file_format
7
- solr_doc[Solrizer.solr_name('file_format', :facetable)] = object.file_format
8
- solr_doc[Solrizer.solr_name(:file_size, :symbol)] = object.file_size[0]
9
- solr_doc['all_text_timv'] = object.full_text.content
10
- solr_doc[Solrizer.solr_name('generic_work_ids', :symbol)] = object.generic_work_ids unless object.generic_work_ids.empty?
11
- end
12
- end
13
- end
14
- end