dor_indexing 1.4.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +15 -7
- data/README.md +23 -3
- data/dor_indexing.gemspec +1 -0
- data/lib/dor_indexing/builders/document_builder.rb +14 -10
- data/lib/dor_indexing/indexers/identifiable_indexer.rb +7 -6
- data/lib/dor_indexing/indexers/releasable_indexer.rb +24 -14
- data/lib/dor_indexing/repository_error.rb +8 -0
- data/lib/dor_indexing/version.rb +1 -1
- data/lib/dor_indexing.rb +9 -2
- metadata +18 -4
- data/lib/dor_indexing/cocina_repository.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b76dd6ecf919e3653810f59c166d313f77eba4a3046bcd1598abd996bf57c74d
|
4
|
+
data.tar.gz: be615ccc690756cd7aff121e175d544caea9cf3b1dd70a6d03e0fd3c9ab14c43
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d04715c311bfd64ac62169e18d4fea188b6c2dc0e8263c38413e3eac3f985e901f3f5ba1181695f5e24ecbbdb713c937bd6c334c0e1f4ec622f293cea4a2195b
|
7
|
+
data.tar.gz: de51ceba624569579cbfe1c1f259ba3568158443ed849ed3d253a144ed6c25346d89916df530c7bc91891afe5f3de19e508cca9cc23818f9e61b262b7306a2a2
|
data/Gemfile.lock
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
dor_indexing (
|
4
|
+
dor_indexing (2.0.0)
|
5
5
|
activesupport
|
6
6
|
cocina-models (~> 0.95.1)
|
7
|
+
dor-services-client (~> 14.0)
|
7
8
|
dor-workflow-client (~> 7.0)
|
8
9
|
honeybadger
|
9
10
|
marc-vocab (~> 0.3.0)
|
@@ -26,7 +27,7 @@ GEM
|
|
26
27
|
ast (2.4.2)
|
27
28
|
attr_extras (7.1.0)
|
28
29
|
base64 (0.2.0)
|
29
|
-
bigdecimal (3.1.
|
30
|
+
bigdecimal (3.1.7)
|
30
31
|
byebug (11.1.3)
|
31
32
|
cocina-models (0.95.1)
|
32
33
|
activesupport
|
@@ -51,6 +52,13 @@ GEM
|
|
51
52
|
activesupport
|
52
53
|
diff-lcs (1.5.1)
|
53
54
|
docile (1.4.0)
|
55
|
+
dor-services-client (14.4.0)
|
56
|
+
activesupport (>= 4.2, < 8)
|
57
|
+
cocina-models (~> 0.95.1)
|
58
|
+
deprecation
|
59
|
+
faraday (~> 2.0)
|
60
|
+
faraday-retry
|
61
|
+
zeitwerk (~> 2.1)
|
54
62
|
dor-workflow-client (7.0.2)
|
55
63
|
activesupport (>= 3.2.1, < 8)
|
56
64
|
deprecation (>= 0.99.0)
|
@@ -89,7 +97,7 @@ GEM
|
|
89
97
|
net-http
|
90
98
|
faraday-retry (2.2.0)
|
91
99
|
faraday (~> 2.0)
|
92
|
-
honeybadger (5.
|
100
|
+
honeybadger (5.8.0)
|
93
101
|
i18n (1.14.4)
|
94
102
|
concurrent-ruby (~> 1.0)
|
95
103
|
ice_nine (0.11.2)
|
@@ -99,7 +107,7 @@ GEM
|
|
99
107
|
multi_json
|
100
108
|
language_server-protocol (3.17.0.3)
|
101
109
|
marc-vocab (0.3.0)
|
102
|
-
minitest (5.22.
|
110
|
+
minitest (5.22.3)
|
103
111
|
mods (3.0.4)
|
104
112
|
edtf (~> 3.0)
|
105
113
|
iso-639
|
@@ -109,9 +117,9 @@ GEM
|
|
109
117
|
mutex_m (0.2.0)
|
110
118
|
net-http (0.4.1)
|
111
119
|
uri
|
112
|
-
nokogiri (1.16.
|
120
|
+
nokogiri (1.16.3-x86_64-darwin)
|
113
121
|
racc (~> 1.4)
|
114
|
-
nokogiri (1.16.
|
122
|
+
nokogiri (1.16.3-x86_64-linux)
|
115
123
|
racc (~> 1.4)
|
116
124
|
nom-xml (1.2.0)
|
117
125
|
i18n
|
@@ -146,7 +154,7 @@ GEM
|
|
146
154
|
rspec-support (3.13.1)
|
147
155
|
rss (0.3.0)
|
148
156
|
rexml
|
149
|
-
rubocop (1.62.
|
157
|
+
rubocop (1.62.1)
|
150
158
|
json (~> 2.3)
|
151
159
|
language_server-protocol (>= 3.17.0)
|
152
160
|
parallel (~> 1.10)
|
data/README.md
CHANGED
@@ -26,12 +26,32 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
26
26
|
|
27
27
|
## Usage
|
28
28
|
|
29
|
-
DorIndexing
|
29
|
+
DorIndexing requires interaction with the SDR workflow API and also needs the following:
|
30
30
|
|
31
|
-
|
31
|
+
* a callable that takes a single argument (a druid) and returns the Cocina for the corresponding object
|
32
|
+
* a callable that takes a single argument (a druid) and returns the list of administrative tags for the corresponding object
|
33
|
+
* a callable that takes a single argument (a druid) and returns the list of release tags for the corresponding object
|
32
34
|
|
33
35
|
```ruby
|
34
36
|
require 'dor_indexing'
|
35
37
|
|
36
|
-
doc = DorIndexing.build(
|
38
|
+
doc = DorIndexing.build(
|
39
|
+
cocina_with_metadata:,
|
40
|
+
workflow_client:,
|
41
|
+
cocina_finder:,
|
42
|
+
administrative_tags_finder:,
|
43
|
+
release_tags_finder:
|
44
|
+
)
|
37
45
|
```
|
46
|
+
|
47
|
+
## Testing
|
48
|
+
|
49
|
+
### Integration Testing with Solr
|
50
|
+
|
51
|
+
We build and update the Solr index via dor-indexing-app and dor-services-app, both of which use this gem for indexing logic.
|
52
|
+
|
53
|
+
Argo is the Blacklight app that uses the Solr index extensively, and it already has the Docker containers to create new test objects in dor-services-app and index them (via dor-indexing-app to Solr). Because Argo is built on top of the Solr index, it is a good place to check results.
|
54
|
+
|
55
|
+
To ensure our indexing behavior produces the desired results, it was easiest to put
|
56
|
+
the full stack integration tests in the argo repository -- they can be found in
|
57
|
+
https://github.com/sul-dlss/argo/tree/main/spec/features/indexing_xxx_spec.rb
|
data/dor_indexing.gemspec
CHANGED
@@ -33,6 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
|
34
34
|
spec.add_dependency 'activesupport'
|
35
35
|
spec.add_dependency 'cocina-models', '~> 0.95.1'
|
36
|
+
spec.add_dependency 'dor-services-client', '~> 14.0'
|
36
37
|
spec.add_dependency 'dor-workflow-client', '~> 7.0'
|
37
38
|
spec.add_dependency 'honeybadger'
|
38
39
|
spec.add_dependency 'marc-vocab', '~> 0.3.0'
|
@@ -48,18 +48,20 @@ class DorIndexing
|
|
48
48
|
|
49
49
|
@@parent_collections = {} # rubocop:disable Style/ClassVars
|
50
50
|
|
51
|
-
def self.for(
|
52
|
-
new(
|
51
|
+
def self.for(...)
|
52
|
+
new(...).for
|
53
53
|
end
|
54
54
|
|
55
55
|
def self.reset_parent_collections
|
56
56
|
@@parent_collections = {} # rubocop:disable Style/ClassVars
|
57
57
|
end
|
58
58
|
|
59
|
-
def initialize(model:, workflow_client:,
|
59
|
+
def initialize(model:, workflow_client:, cocina_finder:, administrative_tags_finder:, release_tags_finder:)
|
60
60
|
@model = model
|
61
61
|
@workflow_client = workflow_client
|
62
|
-
@
|
62
|
+
@cocina_finder = cocina_finder
|
63
|
+
@administrative_tags_finder = administrative_tags_finder
|
64
|
+
@release_tags_finder = release_tags_finder
|
63
65
|
end
|
64
66
|
|
65
67
|
# @param [Cocina::Models::DROWithMetadata,Cocina::Models::CollectionWithMetadata,Cocina::Models::AdminPolicyWithMetadata] model
|
@@ -69,12 +71,14 @@ class DorIndexing
|
|
69
71
|
parent_collections:,
|
70
72
|
administrative_tags:,
|
71
73
|
workflow_client:,
|
72
|
-
|
74
|
+
cocina_finder:,
|
75
|
+
administrative_tags_finder:,
|
76
|
+
release_tags_finder:)
|
73
77
|
end
|
74
78
|
|
75
79
|
private
|
76
80
|
|
77
|
-
attr_reader :model, :workflow_client, :
|
81
|
+
attr_reader :model, :workflow_client, :cocina_finder, :administrative_tags_finder, :release_tags_finder
|
78
82
|
|
79
83
|
def id
|
80
84
|
model.externalIdentifier
|
@@ -88,8 +92,8 @@ class DorIndexing
|
|
88
92
|
return [] unless model.dro?
|
89
93
|
|
90
94
|
Array(model.structural&.isMemberOf).filter_map do |rel_druid|
|
91
|
-
@@parent_collections[rel_druid] ||=
|
92
|
-
rescue DorIndexing::
|
95
|
+
@@parent_collections[rel_druid] ||= cocina_finder.call(rel_druid)
|
96
|
+
rescue DorIndexing::RepositoryError
|
93
97
|
Honeybadger.notify("Bad association found on #{model.externalIdentifier}. #{rel_druid} could not be found")
|
94
98
|
# This may happen if the referenced Collection does not exist (bad data)
|
95
99
|
nil
|
@@ -97,8 +101,8 @@ class DorIndexing
|
|
97
101
|
end
|
98
102
|
|
99
103
|
def administrative_tags
|
100
|
-
|
101
|
-
rescue DorIndexing::
|
104
|
+
administrative_tags_finder.call(id)
|
105
|
+
rescue DorIndexing::RepositoryError
|
102
106
|
[]
|
103
107
|
end
|
104
108
|
end
|
@@ -4,13 +4,14 @@ class DorIndexing
|
|
4
4
|
module Indexers
|
5
5
|
# Indexes the druid, metadata sources, and the apo titles
|
6
6
|
class IdentifiableIndexer
|
7
|
-
attr_reader :cocina, :
|
7
|
+
attr_reader :cocina, :cocina_finder, :administrative_tags_finder
|
8
8
|
|
9
9
|
CURRENT_CATALOG_TYPE = 'folio'
|
10
10
|
|
11
|
-
def initialize(cocina:,
|
11
|
+
def initialize(cocina:, cocina_finder:, administrative_tags_finder:, **)
|
12
12
|
@cocina = cocina
|
13
|
-
@
|
13
|
+
@cocina_finder = cocina_finder
|
14
|
+
@administrative_tags_finder = administrative_tags_finder
|
14
15
|
end
|
15
16
|
|
16
17
|
## Module-level variable, shared between ALL mixin includers (and ALL *their* includers/extenders)!
|
@@ -72,13 +73,13 @@ class DorIndexing
|
|
72
73
|
# populate cache if necessary
|
73
74
|
def populate_cache(rel_druid)
|
74
75
|
@@apo_hash[rel_druid] ||= begin
|
75
|
-
related_obj =
|
76
|
+
related_obj = cocina_finder.call(rel_druid)
|
76
77
|
# APOs don't have projects, and since Hydrus is set to be retired, I don't want to
|
77
78
|
# add the cocina property. Just check the tags service instead.
|
78
79
|
is_from_hydrus = hydrus_tag?(rel_druid)
|
79
80
|
title = Cocina::Models::Builders::TitleBuilder.build(related_obj.description.title)
|
80
81
|
{ 'related_obj_title' => title, 'is_from_hydrus' => is_from_hydrus }
|
81
|
-
rescue
|
82
|
+
rescue RepositoryError
|
82
83
|
Honeybadger.notify("Bad association found on #{cocina.externalIdentifier}. #{rel_druid} could not be found")
|
83
84
|
# This may happen if the given APO or Collection does not exist (bad data)
|
84
85
|
{ 'related_obj_title' => rel_druid, 'is_from_hydrus' => false }
|
@@ -86,7 +87,7 @@ class DorIndexing
|
|
86
87
|
end
|
87
88
|
|
88
89
|
def hydrus_tag?(id)
|
89
|
-
|
90
|
+
administrative_tags_finder.call(id).include?('Project : Hydrus')
|
90
91
|
end
|
91
92
|
end
|
92
93
|
end
|
@@ -4,11 +4,12 @@ class DorIndexing
|
|
4
4
|
module Indexers
|
5
5
|
# Indexes the object's release tags
|
6
6
|
class ReleasableIndexer
|
7
|
-
attr_reader :cocina, :parent_collections
|
7
|
+
attr_reader :cocina, :parent_collections, :release_tags_finder
|
8
8
|
|
9
|
-
def initialize(cocina:, parent_collections:, **)
|
9
|
+
def initialize(cocina:, parent_collections:, release_tags_finder:, **)
|
10
10
|
@cocina = cocina
|
11
11
|
@parent_collections = parent_collections
|
12
|
+
@release_tags_finder = release_tags_finder
|
12
13
|
end
|
13
14
|
|
14
15
|
# @return [Hash] the partial solr document for releasable concerns
|
@@ -18,18 +19,27 @@ class DorIndexing
|
|
18
19
|
{
|
19
20
|
'released_to_ssim' => tags.map(&:to).uniq,
|
20
21
|
'released_to_searchworks_dttsi' => searchworks_release_date,
|
21
|
-
'released_to_earthworks_dttsi' => earthworks_release_date
|
22
|
+
'released_to_earthworks_dttsi' => earthworks_release_date,
|
23
|
+
'released_to_purl_sitemap_dttsi' => purl_sitemap_release_date
|
22
24
|
}.compact
|
23
25
|
end
|
24
26
|
|
25
27
|
private
|
26
28
|
|
29
|
+
def purl_sitemap_release_date
|
30
|
+
date_for_tag 'PURL sitemap'
|
31
|
+
end
|
32
|
+
|
27
33
|
def earthworks_release_date
|
28
|
-
|
34
|
+
date_for_tag 'Earthworks'
|
29
35
|
end
|
30
36
|
|
31
37
|
def searchworks_release_date
|
32
|
-
|
38
|
+
date_for_tag 'Searchworks'
|
39
|
+
end
|
40
|
+
|
41
|
+
def date_for_tag(project)
|
42
|
+
tags.find { |tag| tag.to == project }&.date&.utc&.iso8601
|
33
43
|
end
|
34
44
|
|
35
45
|
# Item tags have precedence over collection tags, so if the collection is release=true
|
@@ -40,8 +50,9 @@ class DorIndexing
|
|
40
50
|
|
41
51
|
def tags_from_collection
|
42
52
|
parent_collections.each_with_object({}) do |collection, result|
|
43
|
-
|
44
|
-
.
|
53
|
+
release_tags_finder
|
54
|
+
.call(collection.externalIdentifier)
|
55
|
+
.select { |tag| tag.what == 'self' }
|
45
56
|
.group_by(&:to).map do |project, releases_for_project|
|
46
57
|
result[project] = releases_for_project.max_by(&:date)
|
47
58
|
end
|
@@ -49,13 +60,12 @@ class DorIndexing
|
|
49
60
|
end
|
50
61
|
|
51
62
|
def tags_from_item
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
Array(cocina.administrative.releaseTags)
|
63
|
+
release_tags_finder
|
64
|
+
.call(cocina.externalIdentifier)
|
65
|
+
.select { |tag| tag.what == 'self' }
|
66
|
+
.group_by(&:to).transform_values do |releases_for_project|
|
67
|
+
releases_for_project.max_by(&:date)
|
68
|
+
end
|
59
69
|
end
|
60
70
|
end
|
61
71
|
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
# Error raised retrieving Cocina objects, administrative tags, or release tags
|
5
|
+
# In DSA, the concrete implementation backs this with CocinaObjectStore.
|
6
|
+
# In DIA, the concrete implementation backs this with Dor Services Client.
|
7
|
+
class RepositoryError < StandardError; end
|
8
|
+
end
|
data/lib/dor_indexing/version.rb
CHANGED
data/lib/dor_indexing.rb
CHANGED
@@ -10,14 +10,21 @@ require 'active_support/core_ext/object/blank'
|
|
10
10
|
require 'active_support/core_ext/enumerable'
|
11
11
|
require 'active_support/core_ext/string'
|
12
12
|
require 'cocina/models'
|
13
|
+
require 'dor/services/client'
|
13
14
|
require 'honeybadger'
|
14
15
|
require 'marc/vocab'
|
15
16
|
|
16
17
|
# Builds solr documents for indexing.
|
17
18
|
class DorIndexing
|
18
19
|
# @return [Hash] the solr document
|
19
|
-
def self.build(cocina_with_metadata:, workflow_client:,
|
20
|
+
def self.build(cocina_with_metadata:, workflow_client:, cocina_finder:, administrative_tags_finder:, release_tags_finder:)
|
20
21
|
Honeybadger.context({ identifier: cocina_with_metadata.externalIdentifier })
|
21
|
-
DorIndexing::Builders::DocumentBuilder.for(
|
22
|
+
DorIndexing::Builders::DocumentBuilder.for(
|
23
|
+
model: cocina_with_metadata,
|
24
|
+
workflow_client:,
|
25
|
+
cocina_finder:,
|
26
|
+
administrative_tags_finder:,
|
27
|
+
release_tags_finder:
|
28
|
+
).to_solr
|
22
29
|
end
|
23
30
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dor_indexing
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Justin Littman
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-03-
|
11
|
+
date: 2024-03-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 0.95.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: dor-services-client
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '14.0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '14.0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: dor-workflow-client
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -135,7 +149,6 @@ files:
|
|
135
149
|
- lib/dor_indexing/builders/publisher_name_builder.rb
|
136
150
|
- lib/dor_indexing/builders/temporal_builder.rb
|
137
151
|
- lib/dor_indexing/builders/topic_builder.rb
|
138
|
-
- lib/dor_indexing/cocina_repository.rb
|
139
152
|
- lib/dor_indexing/indexers/administrative_tag_indexer.rb
|
140
153
|
- lib/dor_indexing/indexers/basic_indexer.rb
|
141
154
|
- lib/dor_indexing/indexers/collection_title_indexer.rb
|
@@ -153,6 +166,7 @@ files:
|
|
153
166
|
- lib/dor_indexing/indexers/workflow_process_indexer.rb
|
154
167
|
- lib/dor_indexing/indexers/workflows_indexer.rb
|
155
168
|
- lib/dor_indexing/marc_country.rb
|
169
|
+
- lib/dor_indexing/repository_error.rb
|
156
170
|
- lib/dor_indexing/selectors/event_selector.rb
|
157
171
|
- lib/dor_indexing/selectors/pub_year_selector.rb
|
158
172
|
- lib/dor_indexing/version.rb
|
@@ -180,7 +194,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
180
194
|
- !ruby/object:Gem::Version
|
181
195
|
version: '0'
|
182
196
|
requirements: []
|
183
|
-
rubygems_version: 3.
|
197
|
+
rubygems_version: 3.5.6
|
184
198
|
signing_key:
|
185
199
|
specification_version: 4
|
186
200
|
summary: Library for creating Solr documents for SDR indexing.
|
@@ -1,24 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class DorIndexing
|
4
|
-
# Interface for retrieving Cocina objects.
|
5
|
-
# In DSA, the concrete implementation backs this with CocinaObjectStore.
|
6
|
-
# In DIA, the concrete implementation backs this with Dor Services Client.
|
7
|
-
class CocinaRepository
|
8
|
-
class RepositoryError < StandardError; end
|
9
|
-
|
10
|
-
# @param [String] druid
|
11
|
-
# @return [Cocina::Models::DROWithMetadata,Cocina::Models::CollectionWithMetadata,Cocina::Models::AdminPolicyWithMetadata]
|
12
|
-
# @raise [RepositoryError] if the object is not found or other error occurs
|
13
|
-
def find(druid)
|
14
|
-
raise NotImplementedError
|
15
|
-
end
|
16
|
-
|
17
|
-
# @param [String] druid
|
18
|
-
# @return [Array<String>] administrative tags
|
19
|
-
# @raise [RepositoryError] if the object is not found or other error occurs
|
20
|
-
def administrative_tags(druid)
|
21
|
-
raise NotImplementedError
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|