dor_indexing 1.4.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +15 -7
- data/README.md +23 -3
- data/dor_indexing.gemspec +1 -0
- data/lib/dor_indexing/builders/document_builder.rb +14 -10
- data/lib/dor_indexing/indexers/identifiable_indexer.rb +7 -6
- data/lib/dor_indexing/indexers/releasable_indexer.rb +24 -14
- data/lib/dor_indexing/repository_error.rb +8 -0
- data/lib/dor_indexing/version.rb +1 -1
- data/lib/dor_indexing.rb +9 -2
- metadata +18 -4
- data/lib/dor_indexing/cocina_repository.rb +0 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b76dd6ecf919e3653810f59c166d313f77eba4a3046bcd1598abd996bf57c74d
|
4
|
+
data.tar.gz: be615ccc690756cd7aff121e175d544caea9cf3b1dd70a6d03e0fd3c9ab14c43
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d04715c311bfd64ac62169e18d4fea188b6c2dc0e8263c38413e3eac3f985e901f3f5ba1181695f5e24ecbbdb713c937bd6c334c0e1f4ec622f293cea4a2195b
|
7
|
+
data.tar.gz: de51ceba624569579cbfe1c1f259ba3568158443ed849ed3d253a144ed6c25346d89916df530c7bc91891afe5f3de19e508cca9cc23818f9e61b262b7306a2a2
|
data/Gemfile.lock
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
dor_indexing (
|
4
|
+
dor_indexing (2.0.0)
|
5
5
|
activesupport
|
6
6
|
cocina-models (~> 0.95.1)
|
7
|
+
dor-services-client (~> 14.0)
|
7
8
|
dor-workflow-client (~> 7.0)
|
8
9
|
honeybadger
|
9
10
|
marc-vocab (~> 0.3.0)
|
@@ -26,7 +27,7 @@ GEM
|
|
26
27
|
ast (2.4.2)
|
27
28
|
attr_extras (7.1.0)
|
28
29
|
base64 (0.2.0)
|
29
|
-
bigdecimal (3.1.
|
30
|
+
bigdecimal (3.1.7)
|
30
31
|
byebug (11.1.3)
|
31
32
|
cocina-models (0.95.1)
|
32
33
|
activesupport
|
@@ -51,6 +52,13 @@ GEM
|
|
51
52
|
activesupport
|
52
53
|
diff-lcs (1.5.1)
|
53
54
|
docile (1.4.0)
|
55
|
+
dor-services-client (14.4.0)
|
56
|
+
activesupport (>= 4.2, < 8)
|
57
|
+
cocina-models (~> 0.95.1)
|
58
|
+
deprecation
|
59
|
+
faraday (~> 2.0)
|
60
|
+
faraday-retry
|
61
|
+
zeitwerk (~> 2.1)
|
54
62
|
dor-workflow-client (7.0.2)
|
55
63
|
activesupport (>= 3.2.1, < 8)
|
56
64
|
deprecation (>= 0.99.0)
|
@@ -89,7 +97,7 @@ GEM
|
|
89
97
|
net-http
|
90
98
|
faraday-retry (2.2.0)
|
91
99
|
faraday (~> 2.0)
|
92
|
-
honeybadger (5.
|
100
|
+
honeybadger (5.8.0)
|
93
101
|
i18n (1.14.4)
|
94
102
|
concurrent-ruby (~> 1.0)
|
95
103
|
ice_nine (0.11.2)
|
@@ -99,7 +107,7 @@ GEM
|
|
99
107
|
multi_json
|
100
108
|
language_server-protocol (3.17.0.3)
|
101
109
|
marc-vocab (0.3.0)
|
102
|
-
minitest (5.22.
|
110
|
+
minitest (5.22.3)
|
103
111
|
mods (3.0.4)
|
104
112
|
edtf (~> 3.0)
|
105
113
|
iso-639
|
@@ -109,9 +117,9 @@ GEM
|
|
109
117
|
mutex_m (0.2.0)
|
110
118
|
net-http (0.4.1)
|
111
119
|
uri
|
112
|
-
nokogiri (1.16.
|
120
|
+
nokogiri (1.16.3-x86_64-darwin)
|
113
121
|
racc (~> 1.4)
|
114
|
-
nokogiri (1.16.
|
122
|
+
nokogiri (1.16.3-x86_64-linux)
|
115
123
|
racc (~> 1.4)
|
116
124
|
nom-xml (1.2.0)
|
117
125
|
i18n
|
@@ -146,7 +154,7 @@ GEM
|
|
146
154
|
rspec-support (3.13.1)
|
147
155
|
rss (0.3.0)
|
148
156
|
rexml
|
149
|
-
rubocop (1.62.
|
157
|
+
rubocop (1.62.1)
|
150
158
|
json (~> 2.3)
|
151
159
|
language_server-protocol (>= 3.17.0)
|
152
160
|
parallel (~> 1.10)
|
data/README.md
CHANGED
@@ -26,12 +26,32 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
26
26
|
|
27
27
|
## Usage
|
28
28
|
|
29
|
-
DorIndexing
|
29
|
+
DorIndexing requires interaction with the SDR workflow API and also needs the following:
|
30
30
|
|
31
|
-
|
31
|
+
* a callable that takes a single argument (a druid) and returns the Cocina for the corresponding object
|
32
|
+
* a callable that takes a single argument (a druid) and returns the list of administrative tags for the corresponding object
|
33
|
+
* a callable that takes a single argument (a druid) and returns the list of release tags for the corresponding object
|
32
34
|
|
33
35
|
```ruby
|
34
36
|
require 'dor_indexing'
|
35
37
|
|
36
|
-
doc = DorIndexing.build(
|
38
|
+
doc = DorIndexing.build(
|
39
|
+
cocina_with_metadata:,
|
40
|
+
workflow_client:,
|
41
|
+
cocina_finder:,
|
42
|
+
administrative_tags_finder:,
|
43
|
+
release_tags_finder:
|
44
|
+
)
|
37
45
|
```
|
46
|
+
|
47
|
+
## Testing
|
48
|
+
|
49
|
+
### Integration Testing with Solr
|
50
|
+
|
51
|
+
We build and update the Solr index via dor-indexing-app and dor-services-app, both of which use this gem for indexing logic.
|
52
|
+
|
53
|
+
Argo is the Blacklight app that uses the Solr index extensively, and it already has the Docker containers to create new test objects in dor-services-app and index them to Solr (via dor_indexing_app). Because Argo is built on top of the Solr index, it is also a good place to check results.
|
54
|
+
|
55
|
+
To ensure our indexing behavior produces the desired results, it was easiest to put
|
56
|
+
the full stack integration tests in the argo repository -- they can be found in
|
57
|
+
https://github.com/sul-dlss/argo/tree/main/spec/features/indexing_xxx_spec.rb
|
data/dor_indexing.gemspec
CHANGED
@@ -33,6 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
|
34
34
|
spec.add_dependency 'activesupport'
|
35
35
|
spec.add_dependency 'cocina-models', '~> 0.95.1'
|
36
|
+
spec.add_dependency 'dor-services-client', '~> 14.0'
|
36
37
|
spec.add_dependency 'dor-workflow-client', '~> 7.0'
|
37
38
|
spec.add_dependency 'honeybadger'
|
38
39
|
spec.add_dependency 'marc-vocab', '~> 0.3.0'
|
@@ -48,18 +48,20 @@ class DorIndexing
|
|
48
48
|
|
49
49
|
@@parent_collections = {} # rubocop:disable Style/ClassVars
|
50
50
|
|
51
|
-
def self.for(
|
52
|
-
new(
|
51
|
+
def self.for(...)
|
52
|
+
new(...).for
|
53
53
|
end
|
54
54
|
|
55
55
|
def self.reset_parent_collections
|
56
56
|
@@parent_collections = {} # rubocop:disable Style/ClassVars
|
57
57
|
end
|
58
58
|
|
59
|
-
def initialize(model:, workflow_client:,
|
59
|
+
def initialize(model:, workflow_client:, cocina_finder:, administrative_tags_finder:, release_tags_finder:)
|
60
60
|
@model = model
|
61
61
|
@workflow_client = workflow_client
|
62
|
-
@
|
62
|
+
@cocina_finder = cocina_finder
|
63
|
+
@administrative_tags_finder = administrative_tags_finder
|
64
|
+
@release_tags_finder = release_tags_finder
|
63
65
|
end
|
64
66
|
|
65
67
|
# @param [Cocina::Models::DROWithMetadata,Cocina::Models::CollectionWithMetadata,Cocina::Models::AdminPolicyWithMetadata] model
|
@@ -69,12 +71,14 @@ class DorIndexing
|
|
69
71
|
parent_collections:,
|
70
72
|
administrative_tags:,
|
71
73
|
workflow_client:,
|
72
|
-
|
74
|
+
cocina_finder:,
|
75
|
+
administrative_tags_finder:,
|
76
|
+
release_tags_finder:)
|
73
77
|
end
|
74
78
|
|
75
79
|
private
|
76
80
|
|
77
|
-
attr_reader :model, :workflow_client, :
|
81
|
+
attr_reader :model, :workflow_client, :cocina_finder, :administrative_tags_finder, :release_tags_finder
|
78
82
|
|
79
83
|
def id
|
80
84
|
model.externalIdentifier
|
@@ -88,8 +92,8 @@ class DorIndexing
|
|
88
92
|
return [] unless model.dro?
|
89
93
|
|
90
94
|
Array(model.structural&.isMemberOf).filter_map do |rel_druid|
|
91
|
-
@@parent_collections[rel_druid] ||=
|
92
|
-
rescue DorIndexing::
|
95
|
+
@@parent_collections[rel_druid] ||= cocina_finder.call(rel_druid)
|
96
|
+
rescue DorIndexing::RepositoryError
|
93
97
|
Honeybadger.notify("Bad association found on #{model.externalIdentifier}. #{rel_druid} could not be found")
|
94
98
|
# This may happen if the referenced Collection does not exist (bad data)
|
95
99
|
nil
|
@@ -97,8 +101,8 @@ class DorIndexing
|
|
97
101
|
end
|
98
102
|
|
99
103
|
def administrative_tags
|
100
|
-
|
101
|
-
rescue DorIndexing::
|
104
|
+
administrative_tags_finder.call(id)
|
105
|
+
rescue DorIndexing::RepositoryError
|
102
106
|
[]
|
103
107
|
end
|
104
108
|
end
|
@@ -4,13 +4,14 @@ class DorIndexing
|
|
4
4
|
module Indexers
|
5
5
|
# Indexes the druid, metadata sources, and the apo titles
|
6
6
|
class IdentifiableIndexer
|
7
|
-
attr_reader :cocina, :
|
7
|
+
attr_reader :cocina, :cocina_finder, :administrative_tags_finder
|
8
8
|
|
9
9
|
CURRENT_CATALOG_TYPE = 'folio'
|
10
10
|
|
11
|
-
def initialize(cocina:,
|
11
|
+
def initialize(cocina:, cocina_finder:, administrative_tags_finder:, **)
|
12
12
|
@cocina = cocina
|
13
|
-
@
|
13
|
+
@cocina_finder = cocina_finder
|
14
|
+
@administrative_tags_finder = administrative_tags_finder
|
14
15
|
end
|
15
16
|
|
16
17
|
## Module-level variable, shared between ALL mixin includers (and ALL *their* includers/extenders)!
|
@@ -72,13 +73,13 @@ class DorIndexing
|
|
72
73
|
# populate cache if necessary
|
73
74
|
def populate_cache(rel_druid)
|
74
75
|
@@apo_hash[rel_druid] ||= begin
|
75
|
-
related_obj =
|
76
|
+
related_obj = cocina_finder.call(rel_druid)
|
76
77
|
# APOs don't have projects, and since Hydrus is set to be retired, I don't want to
|
77
78
|
# add the cocina property. Just check the tags service instead.
|
78
79
|
is_from_hydrus = hydrus_tag?(rel_druid)
|
79
80
|
title = Cocina::Models::Builders::TitleBuilder.build(related_obj.description.title)
|
80
81
|
{ 'related_obj_title' => title, 'is_from_hydrus' => is_from_hydrus }
|
81
|
-
rescue
|
82
|
+
rescue RepositoryError
|
82
83
|
Honeybadger.notify("Bad association found on #{cocina.externalIdentifier}. #{rel_druid} could not be found")
|
83
84
|
# This may happen if the given APO or Collection does not exist (bad data)
|
84
85
|
{ 'related_obj_title' => rel_druid, 'is_from_hydrus' => false }
|
@@ -86,7 +87,7 @@ class DorIndexing
|
|
86
87
|
end
|
87
88
|
|
88
89
|
def hydrus_tag?(id)
|
89
|
-
|
90
|
+
administrative_tags_finder.call(id).include?('Project : Hydrus')
|
90
91
|
end
|
91
92
|
end
|
92
93
|
end
|
@@ -4,11 +4,12 @@ class DorIndexing
|
|
4
4
|
module Indexers
|
5
5
|
# Indexes the object's release tags
|
6
6
|
class ReleasableIndexer
|
7
|
-
attr_reader :cocina, :parent_collections
|
7
|
+
attr_reader :cocina, :parent_collections, :release_tags_finder
|
8
8
|
|
9
|
-
def initialize(cocina:, parent_collections:, **)
|
9
|
+
def initialize(cocina:, parent_collections:, release_tags_finder:, **)
|
10
10
|
@cocina = cocina
|
11
11
|
@parent_collections = parent_collections
|
12
|
+
@release_tags_finder = release_tags_finder
|
12
13
|
end
|
13
14
|
|
14
15
|
# @return [Hash] the partial solr document for releasable concerns
|
@@ -18,18 +19,27 @@ class DorIndexing
|
|
18
19
|
{
|
19
20
|
'released_to_ssim' => tags.map(&:to).uniq,
|
20
21
|
'released_to_searchworks_dttsi' => searchworks_release_date,
|
21
|
-
'released_to_earthworks_dttsi' => earthworks_release_date
|
22
|
+
'released_to_earthworks_dttsi' => earthworks_release_date,
|
23
|
+
'released_to_purl_sitemap_dttsi' => purl_sitemap_release_date
|
22
24
|
}.compact
|
23
25
|
end
|
24
26
|
|
25
27
|
private
|
26
28
|
|
29
|
+
def purl_sitemap_release_date
|
30
|
+
date_for_tag 'PURL sitemap'
|
31
|
+
end
|
32
|
+
|
27
33
|
def earthworks_release_date
|
28
|
-
|
34
|
+
date_for_tag 'Earthworks'
|
29
35
|
end
|
30
36
|
|
31
37
|
def searchworks_release_date
|
32
|
-
|
38
|
+
date_for_tag 'Searchworks'
|
39
|
+
end
|
40
|
+
|
41
|
+
def date_for_tag(project)
|
42
|
+
tags.find { |tag| tag.to == project }&.date&.utc&.iso8601
|
33
43
|
end
|
34
44
|
|
35
45
|
# Item tags have precedence over collection tags, so if the collection is release=true
|
@@ -40,8 +50,9 @@ class DorIndexing
|
|
40
50
|
|
41
51
|
def tags_from_collection
|
42
52
|
parent_collections.each_with_object({}) do |collection, result|
|
43
|
-
|
44
|
-
.
|
53
|
+
release_tags_finder
|
54
|
+
.call(collection.externalIdentifier)
|
55
|
+
.select { |tag| tag.what == 'self' }
|
45
56
|
.group_by(&:to).map do |project, releases_for_project|
|
46
57
|
result[project] = releases_for_project.max_by(&:date)
|
47
58
|
end
|
@@ -49,13 +60,12 @@ class DorIndexing
|
|
49
60
|
end
|
50
61
|
|
51
62
|
def tags_from_item
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
Array(cocina.administrative.releaseTags)
|
63
|
+
release_tags_finder
|
64
|
+
.call(cocina.externalIdentifier)
|
65
|
+
.select { |tag| tag.what == 'self' }
|
66
|
+
.group_by(&:to).transform_values do |releases_for_project|
|
67
|
+
releases_for_project.max_by(&:date)
|
68
|
+
end
|
59
69
|
end
|
60
70
|
end
|
61
71
|
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DorIndexing
|
4
|
+
# Error raised retrieving Cocina objects, administrative tags, or release tags
|
5
|
+
# In DSA, the concrete implementation backs this with CocinaObjectStore.
|
6
|
+
# In DIA, the concrete implementation backs this with Dor Services Client.
|
7
|
+
class RepositoryError < StandardError; end
|
8
|
+
end
|
data/lib/dor_indexing/version.rb
CHANGED
data/lib/dor_indexing.rb
CHANGED
@@ -10,14 +10,21 @@ require 'active_support/core_ext/object/blank'
|
|
10
10
|
require 'active_support/core_ext/enumerable'
|
11
11
|
require 'active_support/core_ext/string'
|
12
12
|
require 'cocina/models'
|
13
|
+
require 'dor/services/client'
|
13
14
|
require 'honeybadger'
|
14
15
|
require 'marc/vocab'
|
15
16
|
|
16
17
|
# Builds solr documents for indexing.
|
17
18
|
class DorIndexing
|
18
19
|
# @return [Hash] the solr document
|
19
|
-
def self.build(cocina_with_metadata:, workflow_client:,
|
20
|
+
def self.build(cocina_with_metadata:, workflow_client:, cocina_finder:, administrative_tags_finder:, release_tags_finder:)
|
20
21
|
Honeybadger.context({ identifier: cocina_with_metadata.externalIdentifier })
|
21
|
-
DorIndexing::Builders::DocumentBuilder.for(
|
22
|
+
DorIndexing::Builders::DocumentBuilder.for(
|
23
|
+
model: cocina_with_metadata,
|
24
|
+
workflow_client:,
|
25
|
+
cocina_finder:,
|
26
|
+
administrative_tags_finder:,
|
27
|
+
release_tags_finder:
|
28
|
+
).to_solr
|
22
29
|
end
|
23
30
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dor_indexing
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Justin Littman
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-03-
|
11
|
+
date: 2024-03-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 0.95.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: dor-services-client
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '14.0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '14.0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: dor-workflow-client
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -135,7 +149,6 @@ files:
|
|
135
149
|
- lib/dor_indexing/builders/publisher_name_builder.rb
|
136
150
|
- lib/dor_indexing/builders/temporal_builder.rb
|
137
151
|
- lib/dor_indexing/builders/topic_builder.rb
|
138
|
-
- lib/dor_indexing/cocina_repository.rb
|
139
152
|
- lib/dor_indexing/indexers/administrative_tag_indexer.rb
|
140
153
|
- lib/dor_indexing/indexers/basic_indexer.rb
|
141
154
|
- lib/dor_indexing/indexers/collection_title_indexer.rb
|
@@ -153,6 +166,7 @@ files:
|
|
153
166
|
- lib/dor_indexing/indexers/workflow_process_indexer.rb
|
154
167
|
- lib/dor_indexing/indexers/workflows_indexer.rb
|
155
168
|
- lib/dor_indexing/marc_country.rb
|
169
|
+
- lib/dor_indexing/repository_error.rb
|
156
170
|
- lib/dor_indexing/selectors/event_selector.rb
|
157
171
|
- lib/dor_indexing/selectors/pub_year_selector.rb
|
158
172
|
- lib/dor_indexing/version.rb
|
@@ -180,7 +194,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
180
194
|
- !ruby/object:Gem::Version
|
181
195
|
version: '0'
|
182
196
|
requirements: []
|
183
|
-
rubygems_version: 3.
|
197
|
+
rubygems_version: 3.5.6
|
184
198
|
signing_key:
|
185
199
|
specification_version: 4
|
186
200
|
summary: Library for creating Solr documents for SDR indexing.
|
@@ -1,24 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
class DorIndexing
|
4
|
-
# Interface for retrieving Cocina objects.
|
5
|
-
# In DSA, the concrete implementation backs this with CocinaObjectStore.
|
6
|
-
# In DIA, the concrete implementation backs this with Dor Services Client.
|
7
|
-
class CocinaRepository
|
8
|
-
class RepositoryError < StandardError; end
|
9
|
-
|
10
|
-
# @param [String] druid
|
11
|
-
# @return [Cocina::Models::DROWithMetadata,Cocina::Models::CollectionWithMetadata,Cocina::Models::AdminPolicyWithMetadata]
|
12
|
-
# @raise [RepositoryError] if the object is not found or other error occurs
|
13
|
-
def find(druid)
|
14
|
-
raise NotImplementedError
|
15
|
-
end
|
16
|
-
|
17
|
-
# @param [String] druid
|
18
|
-
# @return [Array<String>] administrative tags
|
19
|
-
# @raise [RepositoryError] if the object is not found or other error occurs
|
20
|
-
def administrative_tags(druid)
|
21
|
-
raise NotImplementedError
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|