biblicus 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. biblicus/__init__.py +5 -5
  2. biblicus/analysis/__init__.py +1 -1
  3. biblicus/analysis/base.py +10 -10
  4. biblicus/analysis/markov.py +78 -68
  5. biblicus/analysis/models.py +47 -47
  6. biblicus/analysis/profiling.py +58 -48
  7. biblicus/analysis/topic_modeling.py +56 -51
  8. biblicus/cli.py +224 -177
  9. biblicus/{recipes.py → configuration.py} +14 -14
  10. biblicus/constants.py +2 -2
  11. biblicus/context_engine/assembler.py +49 -19
  12. biblicus/context_engine/retrieval.py +46 -42
  13. biblicus/corpus.py +116 -108
  14. biblicus/errors.py +3 -3
  15. biblicus/evaluation.py +27 -25
  16. biblicus/extraction.py +103 -98
  17. biblicus/extraction_evaluation.py +26 -26
  18. biblicus/extractors/deepgram_stt.py +7 -7
  19. biblicus/extractors/docling_granite_text.py +11 -11
  20. biblicus/extractors/docling_smol_text.py +11 -11
  21. biblicus/extractors/markitdown_text.py +4 -4
  22. biblicus/extractors/openai_stt.py +7 -7
  23. biblicus/extractors/paddleocr_vl_text.py +20 -18
  24. biblicus/extractors/pipeline.py +8 -8
  25. biblicus/extractors/rapidocr_text.py +3 -3
  26. biblicus/extractors/unstructured_text.py +3 -3
  27. biblicus/hooks.py +4 -4
  28. biblicus/knowledge_base.py +33 -31
  29. biblicus/models.py +78 -78
  30. biblicus/retrieval.py +47 -40
  31. biblicus/retrievers/__init__.py +50 -0
  32. biblicus/retrievers/base.py +65 -0
  33. biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
  34. biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
  35. biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
  36. biblicus/retrievers/hybrid.py +301 -0
  37. biblicus/{backends → retrievers}/scan.py +83 -73
  38. biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
  39. biblicus/{backends → retrievers}/tf_vector.py +87 -77
  40. biblicus/text/prompts.py +16 -8
  41. biblicus/text/tool_loop.py +63 -5
  42. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/METADATA +52 -43
  43. biblicus-1.1.1.dist-info/RECORD +91 -0
  44. biblicus/backends/__init__.py +0 -50
  45. biblicus/backends/base.py +0 -65
  46. biblicus/backends/hybrid.py +0 -292
  47. biblicus-1.0.0.dist-info/RECORD +0 -91
  48. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/WHEEL +0 -0
  49. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/entry_points.txt +0 -0
  50. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/licenses/LICENSE +0 -0
  51. {biblicus-1.0.0.dist-info → biblicus-1.1.1.dist-info}/top_level.txt +0 -0
@@ -11,21 +11,21 @@ from pydantic import Field, field_validator, model_validator
11
11
 
12
12
  from ..ai.models import EmbeddingsClientConfig, LlmClientConfig
13
13
  from ..constants import ANALYSIS_SCHEMA_VERSION
14
- from ..models import ExtractionRunReference
14
+ from ..models import ExtractionSnapshotReference
15
15
  from .schema import AnalysisSchemaModel
16
16
 
17
17
 
18
- class AnalysisRecipeManifest(AnalysisSchemaModel):
18
+ class AnalysisConfigurationManifest(AnalysisSchemaModel):
19
19
  """
20
20
  Reproducible configuration for an analysis pipeline.
21
21
 
22
- :ivar recipe_id: Deterministic recipe identifier.
23
- :vartype recipe_id: str
22
+ :ivar configuration_id: Deterministic configuration identifier.
23
+ :vartype configuration_id: str
24
24
  :ivar analysis_id: Analysis backend identifier.
25
25
  :vartype analysis_id: str
26
- :ivar name: Human-readable recipe name.
26
+ :ivar name: Human-readable configuration name.
27
27
  :vartype name: str
28
- :ivar created_at: International Organization for Standardization 8601 timestamp for recipe creation.
28
+ :ivar created_at: International Organization for Standardization 8601 timestamp for configuration creation.
29
29
  :vartype created_at: str
30
30
  :ivar config: Analysis-specific configuration values.
31
31
  :vartype config: dict[str, Any]
@@ -33,7 +33,7 @@ class AnalysisRecipeManifest(AnalysisSchemaModel):
33
33
  :vartype description: str or None
34
34
  """
35
35
 
36
- recipe_id: str
36
+ configuration_id: str
37
37
  analysis_id: str
38
38
  name: str
39
39
  created_at: str
@@ -43,30 +43,30 @@ class AnalysisRecipeManifest(AnalysisSchemaModel):
43
43
 
44
44
  class AnalysisRunInput(AnalysisSchemaModel):
45
45
  """
46
- Inputs required to execute an analysis run.
46
+ Inputs required to execute an analysis snapshot.
47
47
 
48
- :ivar extraction_run: Extraction run reference for analysis inputs.
49
- :vartype extraction_run: biblicus.models.ExtractionRunReference
48
+ :ivar extraction_snapshot: Extraction snapshot reference for analysis inputs.
49
+ :vartype extraction_snapshot: biblicus.models.ExtractionSnapshotReference
50
50
  """
51
51
 
52
- extraction_run: ExtractionRunReference
52
+ extraction_snapshot: ExtractionSnapshotReference
53
53
 
54
54
 
55
55
  class AnalysisRunManifest(AnalysisSchemaModel):
56
56
  """
57
- Immutable record of an analysis run.
57
+ Immutable record of an analysis snapshot.
58
58
 
59
- :ivar run_id: Unique run identifier.
60
- :vartype run_id: str
61
- :ivar recipe: Recipe manifest for this run.
62
- :vartype recipe: AnalysisRecipeManifest
59
+ :ivar snapshot_id: Unique snapshot identifier.
60
+ :vartype snapshot_id: str
61
+ :ivar configuration: Configuration manifest for this run.
62
+ :vartype configuration: AnalysisConfigurationManifest
63
63
  :ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
64
64
  :vartype corpus_uri: str
65
65
  :ivar catalog_generated_at: Catalog timestamp used for the run.
66
66
  :vartype catalog_generated_at: str
67
67
  :ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
68
68
  :vartype created_at: str
69
- :ivar input: Inputs used for this analysis run.
69
+ :ivar input: Inputs used for this analysis snapshot.
70
70
  :vartype input: AnalysisRunInput
71
71
  :ivar artifact_paths: Relative paths to materialized artifacts.
72
72
  :vartype artifact_paths: list[str]
@@ -74,8 +74,8 @@ class AnalysisRunManifest(AnalysisSchemaModel):
74
74
  :vartype stats: dict[str, Any]
75
75
  """
76
76
 
77
- run_id: str
78
- recipe: AnalysisRecipeManifest
77
+ snapshot_id: str
78
+ configuration: AnalysisConfigurationManifest
79
79
  corpus_uri: str
80
80
  catalog_generated_at: str
81
81
  created_at: str
@@ -84,9 +84,9 @@ class AnalysisRunManifest(AnalysisSchemaModel):
84
84
  stats: Dict[str, Any] = Field(default_factory=dict)
85
85
 
86
86
 
87
- class ProfilingRecipeConfig(AnalysisSchemaModel):
87
+ class ProfilingConfiguration(AnalysisSchemaModel):
88
88
  """
89
- Recipe configuration for profiling analysis.
89
+ Configuration for profiling analysis.
90
90
 
91
91
  :ivar schema_version: Analysis schema version.
92
92
  :vartype schema_version: int
@@ -110,7 +110,7 @@ class ProfilingRecipeConfig(AnalysisSchemaModel):
110
110
  tag_filters: Optional[List[str]] = None
111
111
 
112
112
  @model_validator(mode="after")
113
- def _validate_schema_version(self) -> "ProfilingRecipeConfig":
113
+ def _validate_schema_version(self) -> "ProfilingConfiguration":
114
114
  if self.schema_version != ANALYSIS_SCHEMA_VERSION:
115
115
  raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
116
116
  return self
@@ -237,7 +237,7 @@ class ProfilingExtractedTextReport(AnalysisSchemaModel):
237
237
  """
238
238
  Summary of extracted text coverage.
239
239
 
240
- :ivar source_items: Count of source items in the extraction run.
240
+ :ivar source_items: Count of source items in the extraction snapshot.
241
241
  :vartype source_items: int
242
242
  :ivar extracted_nonempty_items: Count of extracted items with non-empty text.
243
243
  :vartype extracted_nonempty_items: int
@@ -286,8 +286,8 @@ class ProfilingOutput(AnalysisSchemaModel):
286
286
  :vartype analysis_id: str
287
287
  :ivar generated_at: International Organization for Standardization 8601 timestamp for output creation.
288
288
  :vartype generated_at: str
289
- :ivar run: Analysis run manifest.
290
- :vartype run: AnalysisRunManifest
289
+ :ivar snapshot: Analysis snapshot manifest.
290
+ :vartype snapshot: AnalysisRunManifest
291
291
  :ivar report: Profiling report data.
292
292
  :vartype report: ProfilingReport
293
293
  """
@@ -295,7 +295,7 @@ class ProfilingOutput(AnalysisSchemaModel):
295
295
  schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
296
296
  analysis_id: str
297
297
  generated_at: str
298
- run: AnalysisRunManifest
298
+ snapshot: AnalysisRunManifest
299
299
  report: ProfilingReport
300
300
 
301
301
 
@@ -482,9 +482,9 @@ class TopicModelingLlmFineTuningConfig(AnalysisSchemaModel):
482
482
  return self
483
483
 
484
484
 
485
- class TopicModelingRecipeConfig(AnalysisSchemaModel):
485
+ class TopicModelingConfiguration(AnalysisSchemaModel):
486
486
  """
487
- Recipe configuration for topic modeling analysis.
487
+ Configuration for topic modeling analysis.
488
488
 
489
489
  :ivar schema_version: Analysis schema version.
490
490
  :vartype schema_version: int
@@ -518,7 +518,7 @@ class TopicModelingRecipeConfig(AnalysisSchemaModel):
518
518
  )
519
519
 
520
520
  @model_validator(mode="after")
521
- def _validate_schema_version(self) -> "TopicModelingRecipeConfig":
521
+ def _validate_schema_version(self) -> "TopicModelingConfiguration":
522
522
  if self.schema_version != ANALYSIS_SCHEMA_VERSION:
523
523
  raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
524
524
  return self
@@ -764,8 +764,8 @@ class TopicModelingOutput(AnalysisSchemaModel):
764
764
  :vartype analysis_id: str
765
765
  :ivar generated_at: International Organization for Standardization 8601 timestamp for output creation.
766
766
  :vartype generated_at: str
767
- :ivar run: Analysis run manifest.
768
- :vartype run: AnalysisRunManifest
767
+ :ivar snapshot: Analysis snapshot manifest.
768
+ :vartype snapshot: AnalysisRunManifest
769
769
  :ivar report: Topic modeling report data.
770
770
  :vartype report: TopicModelingReport
771
771
  """
@@ -773,7 +773,7 @@ class TopicModelingOutput(AnalysisSchemaModel):
773
773
  schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
774
774
  analysis_id: str
775
775
  generated_at: str
776
- run: AnalysisRunManifest
776
+ snapshot: AnalysisRunManifest
777
777
  report: TopicModelingReport
778
778
 
779
779
 
@@ -1049,26 +1049,26 @@ class MarkovAnalysisTopicModelingConfig(AnalysisSchemaModel):
1049
1049
 
1050
1050
  :ivar enabled: Whether to run topic modeling on segments.
1051
1051
  :vartype enabled: bool
1052
- :ivar recipe: Topic modeling recipe applied to segments.
1053
- :vartype recipe: TopicModelingRecipeConfig or None
1052
+ :ivar configuration: Topic modeling configuration applied to segments.
1053
+ :vartype configuration: TopicModelingConfiguration or None
1054
1054
  """
1055
1055
 
1056
1056
  enabled: bool = Field(default=False)
1057
- recipe: Optional["TopicModelingRecipeConfig"] = None
1057
+ configuration: Optional["TopicModelingConfiguration"] = None
1058
1058
 
1059
1059
  @model_validator(mode="after")
1060
1060
  def _validate_requirements(self) -> "MarkovAnalysisTopicModelingConfig":
1061
1061
  if not self.enabled:
1062
1062
  return self
1063
- if self.recipe is None:
1063
+ if self.configuration is None:
1064
1064
  raise ValueError(
1065
- "topic_modeling.recipe is required when topic_modeling.enabled is true"
1065
+ "topic_modeling.configuration is required when topic_modeling.enabled is true"
1066
1066
  )
1067
- if self.recipe.llm_extraction.enabled and (
1068
- self.recipe.llm_extraction.method != TopicModelingLlmExtractionMethod.SINGLE
1067
+ if self.configuration.llm_extraction.enabled and (
1068
+ self.configuration.llm_extraction.method != TopicModelingLlmExtractionMethod.SINGLE
1069
1069
  ):
1070
1070
  raise ValueError(
1071
- "topic_modeling.recipe.llm_extraction.method must be 'single' for Markov topic modeling"
1071
+ "topic_modeling.configuration.llm_extraction.method must be 'single' for Markov topic modeling"
1072
1072
  )
1073
1073
  return self
1074
1074
 
@@ -1288,9 +1288,9 @@ class MarkovAnalysisStateNamingConfig(AnalysisSchemaModel):
1288
1288
  return self
1289
1289
 
1290
1290
 
1291
- class MarkovAnalysisRecipeConfig(AnalysisSchemaModel):
1291
+ class MarkovAnalysisConfiguration(AnalysisSchemaModel):
1292
1292
  """
1293
- Recipe configuration for Markov analysis.
1293
+ Configuration for Markov analysis.
1294
1294
 
1295
1295
  :ivar schema_version: Analysis schema version.
1296
1296
  :vartype schema_version: int
@@ -1334,7 +1334,7 @@ class MarkovAnalysisRecipeConfig(AnalysisSchemaModel):
1334
1334
  report: MarkovAnalysisReportConfig = Field(default_factory=MarkovAnalysisReportConfig)
1335
1335
 
1336
1336
  @model_validator(mode="after")
1337
- def _validate_schema_version(self) -> "MarkovAnalysisRecipeConfig":
1337
+ def _validate_schema_version(self) -> "MarkovAnalysisConfiguration":
1338
1338
  if self.schema_version != ANALYSIS_SCHEMA_VERSION:
1339
1339
  raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
1340
1340
  return self
@@ -1346,7 +1346,7 @@ class MarkovAnalysisTextCollectionReport(AnalysisSchemaModel):
1346
1346
 
1347
1347
  :ivar status: Stage status.
1348
1348
  :vartype status: MarkovAnalysisStageStatus
1349
- :ivar source_items: Count of items in extraction run.
1349
+ :ivar source_items: Count of items in extraction snapshot.
1350
1350
  :vartype source_items: int
1351
1351
  :ivar documents: Count of documents included.
1352
1352
  :vartype documents: int
@@ -1517,8 +1517,8 @@ class MarkovAnalysisOutput(AnalysisSchemaModel):
1517
1517
  :vartype analysis_id: str
1518
1518
  :ivar generated_at: International Organization for Standardization 8601 timestamp for output creation.
1519
1519
  :vartype generated_at: str
1520
- :ivar run: Analysis run manifest.
1521
- :vartype run: AnalysisRunManifest
1520
+ :ivar snapshot: Analysis snapshot manifest.
1521
+ :vartype snapshot: AnalysisRunManifest
1522
1522
  :ivar report: Markov analysis report data.
1523
1523
  :vartype report: MarkovAnalysisReport
1524
1524
  """
@@ -1526,5 +1526,5 @@ class MarkovAnalysisOutput(AnalysisSchemaModel):
1526
1526
  schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
1527
1527
  analysis_id: str
1528
1528
  generated_at: str
1529
- run: AnalysisRunManifest
1529
+ snapshot: AnalysisRunManifest
1530
1530
  report: MarkovAnalysisReport
@@ -12,20 +12,20 @@ from typing import Dict, Iterable, List, Sequence
12
12
  from pydantic import BaseModel
13
13
 
14
14
  from ..corpus import Corpus
15
- from ..models import CatalogItem, ExtractionRunReference
15
+ from ..models import CatalogItem, ExtractionSnapshotReference
16
16
  from ..retrieval import hash_text
17
17
  from ..time import utc_now_iso
18
18
  from .base import CorpusAnalysisBackend
19
19
  from .models import (
20
- AnalysisRecipeManifest,
20
+ AnalysisConfigurationManifest,
21
21
  AnalysisRunInput,
22
22
  AnalysisRunManifest,
23
+ ProfilingConfiguration,
23
24
  ProfilingDistributionReport,
24
25
  ProfilingExtractedTextReport,
25
26
  ProfilingOutput,
26
27
  ProfilingPercentileValue,
27
28
  ProfilingRawItemsReport,
28
- ProfilingRecipeConfig,
29
29
  ProfilingReport,
30
30
  ProfilingTagCount,
31
31
  ProfilingTagReport,
@@ -46,62 +46,67 @@ class ProfilingBackend(CorpusAnalysisBackend):
46
46
  self,
47
47
  corpus: Corpus,
48
48
  *,
49
- recipe_name: str,
50
- config: Dict[str, object],
51
- extraction_run: ExtractionRunReference,
49
+ configuration_name: str,
50
+ configuration: Dict[str, object],
51
+ extraction_snapshot: ExtractionSnapshotReference,
52
52
  ) -> BaseModel:
53
53
  """
54
54
  Run the profiling analysis pipeline.
55
55
 
56
56
  :param corpus: Corpus to analyze.
57
57
  :type corpus: Corpus
58
- :param recipe_name: Human-readable recipe name.
59
- :type recipe_name: str
60
- :param config: Analysis configuration values.
61
- :type config: dict[str, object]
62
- :param extraction_run: Extraction run reference for text inputs.
63
- :type extraction_run: biblicus.models.ExtractionRunReference
58
+ :param configuration_name: Human-readable configuration name.
59
+ :type configuration_name: str
60
+ :param configuration: Analysis configuration values.
61
+ :type configuration: dict[str, object]
62
+ :param extraction_snapshot: Extraction snapshot reference for text inputs.
63
+ :type extraction_snapshot: biblicus.models.ExtractionSnapshotReference
64
64
  :return: Profiling output model.
65
65
  :rtype: pydantic.BaseModel
66
66
  """
67
67
  parsed_config = (
68
- config
69
- if isinstance(config, ProfilingRecipeConfig)
70
- else ProfilingRecipeConfig.model_validate(config)
68
+ configuration
69
+ if isinstance(configuration, ProfilingConfiguration)
70
+ else ProfilingConfiguration.model_validate(configuration)
71
71
  )
72
72
  return _run_profiling(
73
73
  corpus=corpus,
74
- recipe_name=recipe_name,
74
+ configuration_name=configuration_name,
75
75
  config=parsed_config,
76
- extraction_run=extraction_run,
76
+ extraction_snapshot=extraction_snapshot,
77
77
  )
78
78
 
79
79
 
80
80
  def _run_profiling(
81
81
  *,
82
82
  corpus: Corpus,
83
- recipe_name: str,
84
- config: ProfilingRecipeConfig,
85
- extraction_run: ExtractionRunReference,
83
+ configuration_name: str,
84
+ config: ProfilingConfiguration,
85
+ extraction_snapshot: ExtractionSnapshotReference,
86
86
  ) -> ProfilingOutput:
87
- recipe = _create_recipe_manifest(name=recipe_name, config=config)
87
+ configuration_manifest = _create_configuration_manifest(
88
+ name=configuration_name,
89
+ config=config,
90
+ )
88
91
  catalog = corpus.load_catalog()
89
- run_id = _analysis_run_id(
90
- recipe_id=recipe.recipe_id,
91
- extraction_run=extraction_run,
92
+ snapshot_id = _analysis_snapshot_id(
93
+ configuration_id=configuration_manifest.configuration_id,
94
+ extraction_snapshot=extraction_snapshot,
92
95
  catalog_generated_at=catalog.generated_at,
93
96
  )
94
97
  run_manifest = AnalysisRunManifest(
95
- run_id=run_id,
96
- recipe=recipe,
98
+ snapshot_id=snapshot_id,
99
+ configuration=configuration_manifest,
97
100
  corpus_uri=catalog.corpus_uri,
98
101
  catalog_generated_at=catalog.generated_at,
99
102
  created_at=utc_now_iso(),
100
- input=AnalysisRunInput(extraction_run=extraction_run),
103
+ input=AnalysisRunInput(extraction_snapshot=extraction_snapshot),
101
104
  artifact_paths=[],
102
105
  stats={},
103
106
  )
104
- run_dir = corpus.analysis_run_dir(analysis_id=ProfilingBackend.analysis_id, run_id=run_id)
107
+ run_dir = corpus.analysis_run_dir(
108
+ analysis_id=ProfilingBackend.analysis_id, snapshot_id=snapshot_id
109
+ )
105
110
  output_path = run_dir / "output.json"
106
111
  run_dir.mkdir(parents=True, exist_ok=True)
107
112
 
@@ -109,7 +114,7 @@ def _run_profiling(
109
114
  raw_report = _build_raw_items_report(items=ordered_items, config=config)
110
115
  extracted_report = _build_extracted_text_report(
111
116
  corpus=corpus,
112
- extraction_run=extraction_run,
117
+ extraction_snapshot=extraction_snapshot,
113
118
  config=config,
114
119
  )
115
120
 
@@ -133,15 +138,17 @@ def _run_profiling(
133
138
  output = ProfilingOutput(
134
139
  analysis_id=ProfilingBackend.analysis_id,
135
140
  generated_at=utc_now_iso(),
136
- run=run_manifest,
141
+ snapshot=run_manifest,
137
142
  report=report,
138
143
  )
139
144
  _write_profiling_output(path=output_path, output=output)
140
145
  return output
141
146
 
142
147
 
143
- def _create_recipe_manifest(*, name: str, config: ProfilingRecipeConfig) -> AnalysisRecipeManifest:
144
- recipe_payload = json.dumps(
148
+ def _create_configuration_manifest(
149
+ *, name: str, config: ProfilingConfiguration
150
+ ) -> AnalysisConfigurationManifest:
151
+ configuration_payload = json.dumps(
145
152
  {
146
153
  "analysis_id": ProfilingBackend.analysis_id,
147
154
  "name": name,
@@ -149,9 +156,9 @@ def _create_recipe_manifest(*, name: str, config: ProfilingRecipeConfig) -> Anal
149
156
  },
150
157
  sort_keys=True,
151
158
  )
152
- recipe_id = hash_text(recipe_payload)
153
- return AnalysisRecipeManifest(
154
- recipe_id=recipe_id,
159
+ configuration_id = hash_text(configuration_payload)
160
+ return AnalysisConfigurationManifest(
161
+ configuration_id=configuration_id,
155
162
  analysis_id=ProfilingBackend.analysis_id,
156
163
  name=name,
157
164
  created_at=utc_now_iso(),
@@ -159,10 +166,13 @@ def _create_recipe_manifest(*, name: str, config: ProfilingRecipeConfig) -> Anal
159
166
  )
160
167
 
161
168
 
162
- def _analysis_run_id(
163
- *, recipe_id: str, extraction_run: ExtractionRunReference, catalog_generated_at: str
169
+ def _analysis_snapshot_id(
170
+ *,
171
+ configuration_id: str,
172
+ extraction_snapshot: ExtractionSnapshotReference,
173
+ catalog_generated_at: str,
164
174
  ) -> str:
165
- run_seed = f"{recipe_id}:{extraction_run.as_string()}:{catalog_generated_at}"
175
+ run_seed = f"{configuration_id}:{extraction_snapshot.as_string()}:{catalog_generated_at}"
166
176
  return hash_text(run_seed)
167
177
 
168
178
 
@@ -186,7 +196,7 @@ def _ordered_catalog_items(
186
196
 
187
197
 
188
198
  def _build_raw_items_report(
189
- *, items: Sequence[CatalogItem], config: ProfilingRecipeConfig
199
+ *, items: Sequence[CatalogItem], config: ProfilingConfiguration
190
200
  ) -> ProfilingRawItemsReport:
191
201
  media_type_counts: Dict[str, int] = {}
192
202
  for item in items:
@@ -205,7 +215,7 @@ def _build_raw_items_report(
205
215
 
206
216
 
207
217
  def _build_tag_report(
208
- *, items: Sequence[CatalogItem], config: ProfilingRecipeConfig
218
+ *, items: Sequence[CatalogItem], config: ProfilingConfiguration
209
219
  ) -> ProfilingTagReport:
210
220
  tag_filters = config.tag_filters
211
221
  tag_filter_set = set(tag_filters or [])
@@ -236,20 +246,20 @@ def _build_tag_report(
236
246
  def _build_extracted_text_report(
237
247
  *,
238
248
  corpus: Corpus,
239
- extraction_run: ExtractionRunReference,
240
- config: ProfilingRecipeConfig,
249
+ extraction_snapshot: ExtractionSnapshotReference,
250
+ config: ProfilingConfiguration,
241
251
  ) -> ProfilingExtractedTextReport:
242
- manifest = corpus.load_extraction_run_manifest(
243
- extractor_id=extraction_run.extractor_id,
244
- run_id=extraction_run.run_id,
252
+ manifest = corpus.load_extraction_snapshot_manifest(
253
+ extractor_id=extraction_snapshot.extractor_id,
254
+ snapshot_id=extraction_snapshot.snapshot_id,
245
255
  )
246
256
  nonempty_items = 0
247
257
  empty_items = 0
248
258
  missing_items = 0
249
259
  text_lengths: List[int] = []
250
- text_dir = corpus.extraction_run_dir(
251
- extractor_id=extraction_run.extractor_id,
252
- run_id=extraction_run.run_id,
260
+ text_dir = corpus.extraction_snapshot_dir(
261
+ extractor_id=extraction_snapshot.extractor_id,
262
+ snapshot_id=extraction_snapshot.snapshot_id,
253
263
  )
254
264
 
255
265
  for item_result in manifest.items: