oarepo-runtime 1.7.2__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -155,11 +155,11 @@ class StreamEntry:
155
155
  def __str__(self):
156
156
  ret = [
157
157
  f"Entry #{self.seq}: id {self.id or 'not yet set'}, filtered: {self.filtered}, deleted: {self.deleted}",
158
- f"Content:",
158
+ "Content:",
159
159
  textwrap.indent(
160
160
  json.dumps(self.entry, ensure_ascii=False, indent=4), " "
161
161
  ),
162
- f"Context:",
162
+ "Context:",
163
163
  textwrap.indent(
164
164
  json.dumps(self.context, ensure_ascii=False, indent=4), " "
165
165
  ),
@@ -238,14 +238,14 @@ class DataStreamCallback:
238
238
  def batch_started(self, batch):
239
239
  log.info("Batch started: %s", batch)
240
240
 
241
- def batch_finished(self, batch):
242
- log.info("Batch finished: %s", batch)
241
+ def batch_finished(self, batch: StreamBatch):
242
+ log.info("Batch finished: %s", batch.seq)
243
243
  for err in batch.errors:
244
- log.error("Failed batch: %s: %s", err, batch)
244
+ log.error("Failed batch: %s: %s", err, batch.seq)
245
245
  if self.log_error_entry:
246
246
  for entry in batch.entries:
247
247
  if entry.errors:
248
- log.error("Failed entry: %s", entry)
248
+ log.error("Failed entry: %s in batch %s", entry, batch.seq)
249
249
 
250
250
  def reader_error(self, reader, exception):
251
251
  log.error("Reader error: %s: %s", reader, exception)
@@ -1,12 +1,27 @@
1
- from .icu import ICUField, ICUSortField, ICUSuggestField
1
+ from .icu import (
2
+ FulltextIndexField,
3
+ ICUField,
4
+ ICUSearchField,
5
+ ICUSortField,
6
+ ICUSuggestField,
7
+ TermIndexField,
8
+ )
2
9
  from .mapping import MappingSystemFieldMixin, SystemFieldDumperExt
3
- from .selectors import FirstItemSelector, PathSelector, Selector, FilteredSelector, MultiSelector
10
+ from .selectors import (
11
+ FilteredSelector,
12
+ FirstItemSelector,
13
+ MultiSelector,
14
+ PathSelector,
15
+ Selector,
16
+ )
4
17
  from .synthetic import SyntheticSystemField
5
18
 
6
19
  __all__ = (
7
20
  "ICUField",
8
21
  "ICUSuggestField",
9
22
  "ICUSortField",
23
+ "ICUSearchField",
24
+ "FulltextIndexField",
10
25
  "MappingSystemFieldMixin",
11
26
  "SystemFieldDumperExt",
12
27
  "SyntheticSystemField",
@@ -15,4 +30,5 @@ __all__ = (
15
30
  "FirstItemSelector",
16
31
  "FilteredSelector",
17
32
  "MultiSelector",
33
+ "TermIndexField",
18
34
  )
@@ -1,3 +1,4 @@
1
+ from abc import abstractmethod, abstractproperty
1
2
  from functools import cached_property
2
3
  from typing import Dict
3
4
 
@@ -8,17 +9,16 @@ from oarepo_runtime.records.relations.lookup import lookup_key
8
9
  from oarepo_runtime.records.systemfields.mapping import MappingSystemFieldMixin
9
10
 
10
11
 
11
- class ICUField(MappingSystemFieldMixin, SystemField):
12
+ class ICUBase(MappingSystemFieldMixin, SystemField):
12
13
  """
13
- A system field that acts as an opensearch "proxy" to another field.
14
- It creates a top-level mapping field with the same name and copies
15
- content of {another field}.language into {mapping field}.language.
16
-
17
- The language accessor can be modified by overriding get_values method.
14
+ Base class for ICU system fields.
15
+ It provides the basic functionality for ICU fields, such as
16
+ getting the attribute name and handling the key.
18
17
  """
19
18
 
20
- def __init__(self, *, source_field, key=None):
21
- super().__init__(key)
19
+ def __init__(self, source_field=None, key=None):
20
+ super().__init__(key=key)
21
+ self._attr_name = key or self.__class__.__name__.lower()
22
22
  self.source_field = source_field
23
23
 
24
24
  @cached_property
@@ -37,26 +37,71 @@ class ICUField(MappingSystemFieldMixin, SystemField):
37
37
  ret = []
38
38
  for l in lookup_key(data, f"{self.source_field}"):
39
39
  if isinstance(l.value, str):
40
+ # take single value as being always the language provided
40
41
  ret.append(l.value)
41
42
  elif isinstance(l.value, dict):
43
+ # expected to be {"cs": "", "en": ""}
42
44
  val = l.value.get(language)
43
45
  if val:
44
46
  ret.append(val)
47
+ elif "lang" in l.value:
48
+ # for [{"lang": "", "value": ""}, ...] we get each item separately
49
+ # that's why we do not iterate over l.value
50
+ if l.value["lang"] == language:
51
+ ret.append(l.value["value"])
45
52
  return ret
46
53
 
54
+ @abstractproperty
55
+ def mapping(self):
56
+ """
57
+ The mapping for the field. It should return a dictionary with the
58
+ mapping for the field, based on the current configuration of the application.
59
+ """
60
+ raise NotImplementedError("Subclasses must implement the mapping property.")
61
+
62
+ @abstractmethod
47
63
  def search_dump(self, data, record):
48
- ret = {}
49
- for lang in self.languages:
50
- ret[lang] = self.get_values(data, lang)
51
- data[self.attr_name] = ret
64
+ """
65
+ Dump custom field. This method should be implemented by subclasses
66
+ to provide the functionality for dumping the field data into the
67
+ OpenSearch data structure.
68
+ """
69
+ raise NotImplementedError("Subclasses must implement the search_dump method.")
52
70
 
53
71
  def search_load(self, data, record_cls):
72
+ """
73
+ Just remove the field from the data on load.
74
+ """
54
75
  data.pop(self.attr_name, None)
55
76
 
56
77
  def __get__(self, instance, owner):
57
78
  return self
58
79
 
59
80
 
81
+ class ICUField(ICUBase):
82
+ """
83
+ A system field that acts as an opensearch "proxy" to another field.
84
+ It creates a top-level mapping field with the same name and copies
85
+ content of {another field}.language into {mapping field}.language.
86
+
87
+ The language accessor can be modified by overriding get_values method.
88
+ """
89
+
90
+ def __init__(self, *, source_field, key=None):
91
+ super().__init__(source_field=source_field, key=key)
92
+
93
+ def search_dump(self, data, record):
94
+ ret = {}
95
+ for lang in self.languages:
96
+ r = self.get_values(data, lang)
97
+ if r:
98
+ # if the language is not empty, add it to the result
99
+ # otherwise do not add it at all to save transport
100
+ ret[lang] = r
101
+ if ret:
102
+ data[self.attr_name] = ret
103
+
104
+
60
105
  class ICUSortField(ICUField):
61
106
  """
62
107
  A field that adds icu sorting field
@@ -132,23 +177,24 @@ class ICUSuggestField(ICUField):
132
177
  }
133
178
 
134
179
 
135
- class ICUSearchField(ICUField):
136
- """
137
- A field that adds stemming-aware search field
138
- """
180
+ class ICUSearchAnalyzerMixin:
139
181
 
140
182
  default_stemming_analyzers = {
141
183
  "stemming_analyzer_cs": {
142
184
  "tokenizer": "standard",
143
- "filter": ["stemming_filter_cs"],
185
+ "filter": ["stemming_filter_cs", "lowercase"],
144
186
  },
145
187
  "stemming_analyzer_en": {
146
188
  "tokenizer": "standard",
147
- "filter": ["stemming_filter_en"],
189
+ "filter": ["stemming_filter_en", "lowercase"],
148
190
  },
149
191
  "ascii_folding_analyzer": {
150
192
  "tokenizer": "standard",
151
- "filter": ["ascii_folding_filter"],
193
+ "filter": ["ascii_folding_filter", "lowercase"],
194
+ },
195
+ "lowercase_analyzer": {
196
+ "tokenizer": "standard",
197
+ "filter": ["lowercase"],
152
198
  },
153
199
  }
154
200
 
@@ -166,8 +212,31 @@ class ICUSearchField(ICUField):
166
212
  "ascii_folding_filter": {"type": "asciifolding", "preserve_original": True},
167
213
  }
168
214
 
169
- def __init__(self, source_field, key=None):
215
+ @property
216
+ def mapping_settings(self):
217
+ return {
218
+ "analysis": {
219
+ "analyzer": current_app.config.get(
220
+ "OAREPO_ICU_SEARCH_ANALYZERS", self.default_stemming_analyzers
221
+ ),
222
+ "filter": current_app.config.get(
223
+ "OAREPO_ICU_SEARCH_FILTERS", self.default_stemming_filters
224
+ ),
225
+ }
226
+ }
227
+
228
+
229
+ class ICUSearchField(ICUSearchAnalyzerMixin, ICUField):
230
+ """
231
+ A field that adds stemming-aware search field for multilingual data (
232
+ e.g. data that contains {"cs": "...", "en": "..."}
233
+ or [{"lang": "cs", "value": "..."}, ...]
234
+ )
235
+ """
236
+
237
+ def __init__(self, source_field, key=None, boost=1):
170
238
  super().__init__(source_field=source_field, key=key)
239
+ self.boost = boost
171
240
 
172
241
  @property
173
242
  def mapping(self):
@@ -180,17 +249,22 @@ class ICUSearchField(ICUField):
180
249
  "search",
181
250
  {
182
251
  "type": "text",
183
- "boost": 1,
252
+ "boost": 1 * self.boost,
184
253
  "fields": {
185
254
  "stemmed": {
186
255
  "type": "text",
187
256
  "analyzer": f"stemming_analyzer_{lang}",
188
- "boost": 0.5,
257
+ "boost": 0.5 * self.boost,
258
+ },
259
+ "lowercase": {
260
+ "type": "text",
261
+ "boost": 0.8 * self.boost,
262
+ "analyzer": "lowercase_analyzer",
189
263
  },
190
264
  "ascii_folded": {
191
265
  "type": "text",
192
266
  "analyzer": "ascii_folding_analyzer",
193
- "boost": 0.3,
267
+ "boost": 0.3 * self.boost,
194
268
  },
195
269
  },
196
270
  },
@@ -200,15 +274,98 @@ class ICUSearchField(ICUField):
200
274
  },
201
275
  }
202
276
 
277
+ def get_values(self, data, language):
278
+ return super().get_values(data, language=language)
279
+
280
+
281
+ class SingleLanguageSearchField(ICUSearchAnalyzerMixin, ICUBase):
282
+ """
283
+ A base class for single-language search fields - that is, data contain a text
284
+ value in a pre-defined, single language.
285
+ """
286
+
287
+ def __init__(self, *, source_field, key=None, language=None, boost=1):
288
+ super().__init__(source_field=source_field, key=key)
289
+ self.language = language
290
+ self.boost = boost
291
+
292
+ def search_dump(self, data, record):
293
+ """Dump custom field."""
294
+ ret = self.get_values(data, language=self.language)
295
+ if ret:
296
+ data[self.attr_name] = ret
297
+
298
+
299
+ class FulltextIndexField(SingleLanguageSearchField):
300
+ """
301
+ A system field that makes the field searchable in OpenSearch,
302
+ regardless if it is indexed/analyzed, embedded in Nested or not.
303
+
304
+ It creates a top-level mapping field and copies
305
+ content of {source_field} into it. It also provides the correct mapping
306
+ for the field based on the current configuration of the application.
307
+
308
+ Unlike the ICU, this field is single-language and the language should
309
+ be provided when initializing the field.
310
+ It defaults to the BABEL_DEFAULT_LOCALE if not provided.
311
+ """
312
+
203
313
  @property
204
- def mapping_settings(self):
205
- return {
206
- "analysis": {
207
- "analyzer": current_app.config.get(
208
- "OAREPO_ICU_SEARCH_ANALYZERS", self.default_stemming_analyzers
209
- ),
210
- "filter": current_app.config.get(
211
- "OAREPO_ICU_SEARCH_FILTERS", self.default_stemming_filters
212
- ),
314
+ def mapping(self):
315
+ language = self.language or current_app.config.get("BABEL_DEFAULT_LOCALE", "en")
316
+ mapping_settings = self.languages.get(language, None)
317
+ if mapping_settings:
318
+ mapping_settings = mapping_settings.get("search")
319
+ if not mapping_settings:
320
+ mapping_settings = {
321
+ "type": "text",
322
+ "boost": 1 * self.boost,
323
+ "fields": {
324
+ "stemmed": {
325
+ "type": "text",
326
+ "analyzer": f"stemming_analyzer_{language}",
327
+ "boost": 0.5 * self.boost,
328
+ },
329
+ "lowercase": {
330
+ "type": "text",
331
+ "boost": 0.8 * self.boost,
332
+ "analyzer": "lowercase_analyzer",
333
+ },
334
+ "ascii_folded": {
335
+ "type": "text",
336
+ "analyzer": "ascii_folding_analyzer",
337
+ "boost": 0.3 * self.boost,
338
+ },
339
+ },
213
340
  }
341
+
342
+ return {self.attr_name: mapping_settings}
343
+
344
+ def search_load(self, data, record_cls):
345
+ """Load custom field."""
346
+ data.pop(self.attr_name, None)
347
+
348
+
349
+ class TermIndexField(SingleLanguageSearchField):
350
+ """
351
+ A system field that makes the field searchable in OpenSearch,
352
+ regardless if it is indexed/analyzed, embedded in Nested or not.
353
+
354
+ It creates a top-level mapping field and copies
355
+ content of {source_field} into it. It also provides the correct mapping
356
+ for the field based on the current configuration of the application.
357
+
358
+ Unlike the ICU, this field is single-language and the language should
359
+ be provided when initializing the field.
360
+ It defaults to the BABEL_DEFAULT_LOCALE if not provided.
361
+ """
362
+
363
+ @property
364
+ def mapping(self):
365
+ mapping_settings = {
366
+ "type": "keyword",
367
+ "boost": 1 * self.boost,
368
+ "ignore_above": 256,
214
369
  }
370
+
371
+ return {self.attr_name: mapping_settings}
@@ -16,12 +16,10 @@ class MappingSystemFieldMixin:
16
16
  def dynamic_templates(self):
17
17
  return []
18
18
 
19
- @classmethod
20
- def search_dump(cls, data, record):
19
+ def search_dump(self, data, record):
21
20
  """Dump custom field."""
22
21
 
23
- @classmethod
24
- def search_load(cls, data, record_cls):
22
+ def search_load(self, data, record_cls):
25
23
  """Load custom field."""
26
24
 
27
25
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: oarepo-runtime
3
- Version: 1.7.2
3
+ Version: 1.9.0
4
4
  Summary: A set of runtime extensions of Invenio repository
5
5
  Description-Content-Type: text/markdown
6
6
  License-File: LICENSE
@@ -31,7 +31,7 @@ oarepo_runtime/datastreams/json.py,sha256=OkIkGKUawtYoGelyS5V92DVk7Ei7SlkMMny2Ue
31
31
  oarepo_runtime/datastreams/semi_asynchronous.py,sha256=kNc6BBnV6oFoY9kHgf5l8fd1wibRfI0dwyzLtu4fmUA,2940
32
32
  oarepo_runtime/datastreams/synchronous.py,sha256=t5lfnMkLqy3jK5zMl-nIuA0HlMPiHGjwCqZ8XQP-3GM,2595
33
33
  oarepo_runtime/datastreams/transformers.py,sha256=q5KzHPl2kJg7HP1BtKJ7F_UMqg_7L1ZGDX0O7s8D6UI,521
34
- oarepo_runtime/datastreams/types.py,sha256=KZjblc3T_UFFW7LrMDmiR8lqVf86V484LAHj6yg05EI,9908
34
+ oarepo_runtime/datastreams/types.py,sha256=ZeXX5rVPwCpzJ0drl5ApgN1l43G-aSjStQbt_vdJgRk,9950
35
35
  oarepo_runtime/datastreams/utils.py,sha256=w24qRRw1ZawPFbgqtxhpYnxPnghSn0pp49JLLy8wTms,4280
36
36
  oarepo_runtime/datastreams/readers/__init__.py,sha256=P1n3llZQ3AFHnSPbeT1VaCJcEtRFz9AbHfjkZv5LG7s,1103
37
37
  oarepo_runtime/datastreams/readers/attachments.py,sha256=A7EC1TqyTHG-go5DIaRotlBSOm6o9hGqAKyVVAceCRU,1956
@@ -70,11 +70,11 @@ oarepo_runtime/records/relations/base.py,sha256=ESTwj0-eT8HRTJ8QcE5fmqzjOjBFHpQq
70
70
  oarepo_runtime/records/relations/internal.py,sha256=OTp8iJqyl80sWDk0Q0AK42l6UsxZDABspVU_GwWza9o,1556
71
71
  oarepo_runtime/records/relations/lookup.py,sha256=wi3jPfOedazOmhOMrgu50PUETc1jfSdpmjK0wvOFsEM,848
72
72
  oarepo_runtime/records/relations/pid_relation.py,sha256=eojw5uIo5zXmJGge_bj6Wj2njCRY5S4o4B_h_HFyaDY,3901
73
- oarepo_runtime/records/systemfields/__init__.py,sha256=LL1R64RUakA_4r0IkTq9MtwqD5eV-AQaj5u96zkWa74,533
73
+ oarepo_runtime/records/systemfields/__init__.py,sha256=fU1IiZ24AX92TQi4GZsuBjT9JvtSklk5ANXbabcRkZg,709
74
74
  oarepo_runtime/records/systemfields/featured_file.py,sha256=MbSaYR130_o5S9gEOblnChq-PVK4xGPGpSCrzwG3cwc,1720
75
75
  oarepo_runtime/records/systemfields/has_draftcheck.py,sha256=4JkMEefPLpqtPtlTgK3UT0KzTRgyw5_Qtkss2qcz5xk,1643
76
- oarepo_runtime/records/systemfields/icu.py,sha256=sSGAgi5WhsAY4cCBL7-7nMpvHAuctpW8Y8vRExHQUfk,6738
77
- oarepo_runtime/records/systemfields/mapping.py,sha256=tXOK_jkdY1pOUO7_VfChfDNB8UTi21GUXaidpugTnO8,1017
76
+ oarepo_runtime/records/systemfields/icu.py,sha256=LqttWKdbtBw2FIIrtfDzn3-8jj-e0blm3Oz_XjFtiJw,12625
77
+ oarepo_runtime/records/systemfields/mapping.py,sha256=_Xr9iFJg5Gu198o4rH8bD3TQkRE4AVKXfZBTHa6QHh0,985
78
78
  oarepo_runtime/records/systemfields/owner.py,sha256=dYRVBinniW7ECHuSnTAjeN6x1KhhJtNR9vxmD1KswMs,3805
79
79
  oarepo_runtime/records/systemfields/record_status.py,sha256=U3kem4-JkNsT17e0iAl3HIAZ2MvO5lY_0U757aZvTKE,935
80
80
  oarepo_runtime/records/systemfields/selectors.py,sha256=Q9jE1smSN3heT2LIpK_jB6bIRjll1kX0AW9AhTsIYiU,2830
@@ -150,13 +150,13 @@ oarepo_runtime/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
150
150
  oarepo_runtime/utils/functools.py,sha256=gKS9YZtlIYcDvdNA9cmYO00yjiXBYV1jg8VpcRUyQyg,1324
151
151
  oarepo_runtime/utils/index.py,sha256=ArrUUXB-KowUcUksRKqcFpmqct4bn9alO1zd_kX2tmU,292
152
152
  oarepo_runtime/utils/path.py,sha256=V1NVyk3m12_YLbj7QHYvUpE1wScO78bYsX1LOLeXDkI,3108
153
- oarepo_runtime-1.7.2.dist-info/licenses/LICENSE,sha256=h2uWz0OaB3EN-J1ImdGJZzc7yvfQjvHVYdUhQ-H7ypY,1064
153
+ oarepo_runtime-1.9.0.dist-info/licenses/LICENSE,sha256=h2uWz0OaB3EN-J1ImdGJZzc7yvfQjvHVYdUhQ-H7ypY,1064
154
154
  tests/marshmallow_to_json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  tests/marshmallow_to_json/test_datacite_ui_schema.py,sha256=82iLj8nW45lZOUewpWbLX3mpSkpa9lxo-vK-Qtv_1bU,48552
156
156
  tests/marshmallow_to_json/test_simple_schema.py,sha256=izZN9p0v6kovtSZ6AdxBYmK_c6ZOti2_z_wPT_zXIr0,1500
157
157
  tests/pkg_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
- oarepo_runtime-1.7.2.dist-info/METADATA,sha256=zgNr97N1sEuZ23n7YI99fqBLKHIRKdQdxwp9vS9DtLo,4788
159
- oarepo_runtime-1.7.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
160
- oarepo_runtime-1.7.2.dist-info/entry_points.txt,sha256=k7O5LZUOGsVeSpB7ulU0txBUNp1CVQG7Q7TJIVTPbzU,491
161
- oarepo_runtime-1.7.2.dist-info/top_level.txt,sha256=bHhlkT1_RQC4IkfTQCqA3iN4KCB6cSFQlsXpQMSP-bE,21
162
- oarepo_runtime-1.7.2.dist-info/RECORD,,
158
+ oarepo_runtime-1.9.0.dist-info/METADATA,sha256=PNBhOPkZ-HmVABJk4k9cb5up2HStMFInTy0ZSw0NEZs,4788
159
+ oarepo_runtime-1.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
160
+ oarepo_runtime-1.9.0.dist-info/entry_points.txt,sha256=k7O5LZUOGsVeSpB7ulU0txBUNp1CVQG7Q7TJIVTPbzU,491
161
+ oarepo_runtime-1.9.0.dist-info/top_level.txt,sha256=bHhlkT1_RQC4IkfTQCqA3iN4KCB6cSFQlsXpQMSP-bE,21
162
+ oarepo_runtime-1.9.0.dist-info/RECORD,,