biblicus 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +5 -5
- biblicus/analysis/__init__.py +1 -1
- biblicus/analysis/base.py +10 -10
- biblicus/analysis/markov.py +78 -68
- biblicus/analysis/models.py +47 -47
- biblicus/analysis/profiling.py +58 -48
- biblicus/analysis/topic_modeling.py +56 -51
- biblicus/cli.py +224 -177
- biblicus/{recipes.py → configuration.py} +14 -14
- biblicus/constants.py +2 -2
- biblicus/context_engine/assembler.py +49 -19
- biblicus/context_engine/retrieval.py +46 -42
- biblicus/corpus.py +116 -108
- biblicus/errors.py +3 -3
- biblicus/evaluation.py +27 -25
- biblicus/extraction.py +103 -98
- biblicus/extraction_evaluation.py +26 -26
- biblicus/extractors/deepgram_stt.py +7 -7
- biblicus/extractors/docling_granite_text.py +11 -11
- biblicus/extractors/docling_smol_text.py +11 -11
- biblicus/extractors/markitdown_text.py +4 -4
- biblicus/extractors/openai_stt.py +7 -7
- biblicus/extractors/paddleocr_vl_text.py +20 -18
- biblicus/extractors/pipeline.py +8 -8
- biblicus/extractors/rapidocr_text.py +3 -3
- biblicus/extractors/unstructured_text.py +3 -3
- biblicus/hooks.py +4 -4
- biblicus/knowledge_base.py +33 -31
- biblicus/models.py +78 -78
- biblicus/retrieval.py +47 -40
- biblicus/retrievers/__init__.py +50 -0
- biblicus/retrievers/base.py +65 -0
- biblicus/{backends → retrievers}/embedding_index_common.py +44 -41
- biblicus/{backends → retrievers}/embedding_index_file.py +87 -58
- biblicus/{backends → retrievers}/embedding_index_inmemory.py +88 -59
- biblicus/retrievers/hybrid.py +301 -0
- biblicus/{backends → retrievers}/scan.py +83 -73
- biblicus/{backends → retrievers}/sqlite_full_text_search.py +115 -101
- biblicus/{backends → retrievers}/tf_vector.py +87 -77
- biblicus/text/prompts.py +16 -8
- biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/METADATA +30 -21
- biblicus-1.1.0.dist-info/RECORD +91 -0
- biblicus/backends/__init__.py +0 -50
- biblicus/backends/base.py +0 -65
- biblicus/backends/hybrid.py +0 -292
- biblicus-1.0.0.dist-info/RECORD +0 -91
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/WHEEL +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/entry_points.txt +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-1.0.0.dist-info → biblicus-1.1.0.dist-info}/top_level.txt +0 -0
biblicus/analysis/models.py
CHANGED
|
@@ -11,21 +11,21 @@ from pydantic import Field, field_validator, model_validator
|
|
|
11
11
|
|
|
12
12
|
from ..ai.models import EmbeddingsClientConfig, LlmClientConfig
|
|
13
13
|
from ..constants import ANALYSIS_SCHEMA_VERSION
|
|
14
|
-
from ..models import
|
|
14
|
+
from ..models import ExtractionSnapshotReference
|
|
15
15
|
from .schema import AnalysisSchemaModel
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
class
|
|
18
|
+
class AnalysisConfigurationManifest(AnalysisSchemaModel):
|
|
19
19
|
"""
|
|
20
20
|
Reproducible configuration for an analysis pipeline.
|
|
21
21
|
|
|
22
|
-
:ivar
|
|
23
|
-
:vartype
|
|
22
|
+
:ivar configuration_id: Deterministic configuration identifier.
|
|
23
|
+
:vartype configuration_id: str
|
|
24
24
|
:ivar analysis_id: Analysis backend identifier.
|
|
25
25
|
:vartype analysis_id: str
|
|
26
|
-
:ivar name: Human-readable
|
|
26
|
+
:ivar name: Human-readable configuration name.
|
|
27
27
|
:vartype name: str
|
|
28
|
-
:ivar created_at: International Organization for Standardization 8601 timestamp for
|
|
28
|
+
:ivar created_at: International Organization for Standardization 8601 timestamp for configuration creation.
|
|
29
29
|
:vartype created_at: str
|
|
30
30
|
:ivar config: Analysis-specific configuration values.
|
|
31
31
|
:vartype config: dict[str, Any]
|
|
@@ -33,7 +33,7 @@ class AnalysisRecipeManifest(AnalysisSchemaModel):
|
|
|
33
33
|
:vartype description: str or None
|
|
34
34
|
"""
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
configuration_id: str
|
|
37
37
|
analysis_id: str
|
|
38
38
|
name: str
|
|
39
39
|
created_at: str
|
|
@@ -43,30 +43,30 @@ class AnalysisRecipeManifest(AnalysisSchemaModel):
|
|
|
43
43
|
|
|
44
44
|
class AnalysisRunInput(AnalysisSchemaModel):
|
|
45
45
|
"""
|
|
46
|
-
Inputs required to execute an analysis
|
|
46
|
+
Inputs required to execute an analysis snapshot.
|
|
47
47
|
|
|
48
|
-
:ivar
|
|
49
|
-
:vartype
|
|
48
|
+
:ivar extraction_snapshot: Extraction snapshot reference for analysis inputs.
|
|
49
|
+
:vartype extraction_snapshot: biblicus.models.ExtractionSnapshotReference
|
|
50
50
|
"""
|
|
51
51
|
|
|
52
|
-
|
|
52
|
+
extraction_snapshot: ExtractionSnapshotReference
|
|
53
53
|
|
|
54
54
|
|
|
55
55
|
class AnalysisRunManifest(AnalysisSchemaModel):
|
|
56
56
|
"""
|
|
57
|
-
Immutable record of an analysis
|
|
57
|
+
Immutable record of an analysis snapshot.
|
|
58
58
|
|
|
59
|
-
:ivar
|
|
60
|
-
:vartype
|
|
61
|
-
:ivar
|
|
62
|
-
:vartype
|
|
59
|
+
:ivar snapshot_id: Unique snapshot identifier.
|
|
60
|
+
:vartype snapshot_id: str
|
|
61
|
+
:ivar configuration: Configuration manifest for this run.
|
|
62
|
+
:vartype configuration: AnalysisConfigurationManifest
|
|
63
63
|
:ivar corpus_uri: Canonical uniform resource identifier for the corpus root.
|
|
64
64
|
:vartype corpus_uri: str
|
|
65
65
|
:ivar catalog_generated_at: Catalog timestamp used for the run.
|
|
66
66
|
:vartype catalog_generated_at: str
|
|
67
67
|
:ivar created_at: International Organization for Standardization 8601 timestamp for run creation.
|
|
68
68
|
:vartype created_at: str
|
|
69
|
-
:ivar input: Inputs used for this analysis
|
|
69
|
+
:ivar input: Inputs used for this analysis snapshot.
|
|
70
70
|
:vartype input: AnalysisRunInput
|
|
71
71
|
:ivar artifact_paths: Relative paths to materialized artifacts.
|
|
72
72
|
:vartype artifact_paths: list[str]
|
|
@@ -74,8 +74,8 @@ class AnalysisRunManifest(AnalysisSchemaModel):
|
|
|
74
74
|
:vartype stats: dict[str, Any]
|
|
75
75
|
"""
|
|
76
76
|
|
|
77
|
-
|
|
78
|
-
|
|
77
|
+
snapshot_id: str
|
|
78
|
+
configuration: AnalysisConfigurationManifest
|
|
79
79
|
corpus_uri: str
|
|
80
80
|
catalog_generated_at: str
|
|
81
81
|
created_at: str
|
|
@@ -84,9 +84,9 @@ class AnalysisRunManifest(AnalysisSchemaModel):
|
|
|
84
84
|
stats: Dict[str, Any] = Field(default_factory=dict)
|
|
85
85
|
|
|
86
86
|
|
|
87
|
-
class
|
|
87
|
+
class ProfilingConfiguration(AnalysisSchemaModel):
|
|
88
88
|
"""
|
|
89
|
-
|
|
89
|
+
Configuration for profiling analysis.
|
|
90
90
|
|
|
91
91
|
:ivar schema_version: Analysis schema version.
|
|
92
92
|
:vartype schema_version: int
|
|
@@ -110,7 +110,7 @@ class ProfilingRecipeConfig(AnalysisSchemaModel):
|
|
|
110
110
|
tag_filters: Optional[List[str]] = None
|
|
111
111
|
|
|
112
112
|
@model_validator(mode="after")
|
|
113
|
-
def _validate_schema_version(self) -> "
|
|
113
|
+
def _validate_schema_version(self) -> "ProfilingConfiguration":
|
|
114
114
|
if self.schema_version != ANALYSIS_SCHEMA_VERSION:
|
|
115
115
|
raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
|
|
116
116
|
return self
|
|
@@ -237,7 +237,7 @@ class ProfilingExtractedTextReport(AnalysisSchemaModel):
|
|
|
237
237
|
"""
|
|
238
238
|
Summary of extracted text coverage.
|
|
239
239
|
|
|
240
|
-
:ivar source_items: Count of source items in the extraction
|
|
240
|
+
:ivar source_items: Count of source items in the extraction snapshot.
|
|
241
241
|
:vartype source_items: int
|
|
242
242
|
:ivar extracted_nonempty_items: Count of extracted items with non-empty text.
|
|
243
243
|
:vartype extracted_nonempty_items: int
|
|
@@ -286,8 +286,8 @@ class ProfilingOutput(AnalysisSchemaModel):
|
|
|
286
286
|
:vartype analysis_id: str
|
|
287
287
|
:ivar generated_at: International Organization for Standardization 8601 timestamp for output creation.
|
|
288
288
|
:vartype generated_at: str
|
|
289
|
-
:ivar
|
|
290
|
-
:vartype
|
|
289
|
+
:ivar snapshot: Analysis snapshot manifest.
|
|
290
|
+
:vartype snapshot: AnalysisRunManifest
|
|
291
291
|
:ivar report: Profiling report data.
|
|
292
292
|
:vartype report: ProfilingReport
|
|
293
293
|
"""
|
|
@@ -295,7 +295,7 @@ class ProfilingOutput(AnalysisSchemaModel):
|
|
|
295
295
|
schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
|
|
296
296
|
analysis_id: str
|
|
297
297
|
generated_at: str
|
|
298
|
-
|
|
298
|
+
snapshot: AnalysisRunManifest
|
|
299
299
|
report: ProfilingReport
|
|
300
300
|
|
|
301
301
|
|
|
@@ -482,9 +482,9 @@ class TopicModelingLlmFineTuningConfig(AnalysisSchemaModel):
|
|
|
482
482
|
return self
|
|
483
483
|
|
|
484
484
|
|
|
485
|
-
class
|
|
485
|
+
class TopicModelingConfiguration(AnalysisSchemaModel):
|
|
486
486
|
"""
|
|
487
|
-
|
|
487
|
+
Configuration for topic modeling analysis.
|
|
488
488
|
|
|
489
489
|
:ivar schema_version: Analysis schema version.
|
|
490
490
|
:vartype schema_version: int
|
|
@@ -518,7 +518,7 @@ class TopicModelingRecipeConfig(AnalysisSchemaModel):
|
|
|
518
518
|
)
|
|
519
519
|
|
|
520
520
|
@model_validator(mode="after")
|
|
521
|
-
def _validate_schema_version(self) -> "
|
|
521
|
+
def _validate_schema_version(self) -> "TopicModelingConfiguration":
|
|
522
522
|
if self.schema_version != ANALYSIS_SCHEMA_VERSION:
|
|
523
523
|
raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
|
|
524
524
|
return self
|
|
@@ -764,8 +764,8 @@ class TopicModelingOutput(AnalysisSchemaModel):
|
|
|
764
764
|
:vartype analysis_id: str
|
|
765
765
|
:ivar generated_at: International Organization for Standardization 8601 timestamp for output creation.
|
|
766
766
|
:vartype generated_at: str
|
|
767
|
-
:ivar
|
|
768
|
-
:vartype
|
|
767
|
+
:ivar snapshot: Analysis snapshot manifest.
|
|
768
|
+
:vartype snapshot: AnalysisRunManifest
|
|
769
769
|
:ivar report: Topic modeling report data.
|
|
770
770
|
:vartype report: TopicModelingReport
|
|
771
771
|
"""
|
|
@@ -773,7 +773,7 @@ class TopicModelingOutput(AnalysisSchemaModel):
|
|
|
773
773
|
schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
|
|
774
774
|
analysis_id: str
|
|
775
775
|
generated_at: str
|
|
776
|
-
|
|
776
|
+
snapshot: AnalysisRunManifest
|
|
777
777
|
report: TopicModelingReport
|
|
778
778
|
|
|
779
779
|
|
|
@@ -1049,26 +1049,26 @@ class MarkovAnalysisTopicModelingConfig(AnalysisSchemaModel):
|
|
|
1049
1049
|
|
|
1050
1050
|
:ivar enabled: Whether to run topic modeling on segments.
|
|
1051
1051
|
:vartype enabled: bool
|
|
1052
|
-
:ivar
|
|
1053
|
-
:vartype
|
|
1052
|
+
:ivar configuration: Topic modeling configuration applied to segments.
|
|
1053
|
+
:vartype configuration: TopicModelingConfiguration or None
|
|
1054
1054
|
"""
|
|
1055
1055
|
|
|
1056
1056
|
enabled: bool = Field(default=False)
|
|
1057
|
-
|
|
1057
|
+
configuration: Optional["TopicModelingConfiguration"] = None
|
|
1058
1058
|
|
|
1059
1059
|
@model_validator(mode="after")
|
|
1060
1060
|
def _validate_requirements(self) -> "MarkovAnalysisTopicModelingConfig":
|
|
1061
1061
|
if not self.enabled:
|
|
1062
1062
|
return self
|
|
1063
|
-
if self.
|
|
1063
|
+
if self.configuration is None:
|
|
1064
1064
|
raise ValueError(
|
|
1065
|
-
"topic_modeling.
|
|
1065
|
+
"topic_modeling.configuration is required when topic_modeling.enabled is true"
|
|
1066
1066
|
)
|
|
1067
|
-
if self.
|
|
1068
|
-
self.
|
|
1067
|
+
if self.configuration.llm_extraction.enabled and (
|
|
1068
|
+
self.configuration.llm_extraction.method != TopicModelingLlmExtractionMethod.SINGLE
|
|
1069
1069
|
):
|
|
1070
1070
|
raise ValueError(
|
|
1071
|
-
"topic_modeling.
|
|
1071
|
+
"topic_modeling.configuration.llm_extraction.method must be 'single' for Markov topic modeling"
|
|
1072
1072
|
)
|
|
1073
1073
|
return self
|
|
1074
1074
|
|
|
@@ -1288,9 +1288,9 @@ class MarkovAnalysisStateNamingConfig(AnalysisSchemaModel):
|
|
|
1288
1288
|
return self
|
|
1289
1289
|
|
|
1290
1290
|
|
|
1291
|
-
class
|
|
1291
|
+
class MarkovAnalysisConfiguration(AnalysisSchemaModel):
|
|
1292
1292
|
"""
|
|
1293
|
-
|
|
1293
|
+
Configuration for Markov analysis.
|
|
1294
1294
|
|
|
1295
1295
|
:ivar schema_version: Analysis schema version.
|
|
1296
1296
|
:vartype schema_version: int
|
|
@@ -1334,7 +1334,7 @@ class MarkovAnalysisRecipeConfig(AnalysisSchemaModel):
|
|
|
1334
1334
|
report: MarkovAnalysisReportConfig = Field(default_factory=MarkovAnalysisReportConfig)
|
|
1335
1335
|
|
|
1336
1336
|
@model_validator(mode="after")
|
|
1337
|
-
def _validate_schema_version(self) -> "
|
|
1337
|
+
def _validate_schema_version(self) -> "MarkovAnalysisConfiguration":
|
|
1338
1338
|
if self.schema_version != ANALYSIS_SCHEMA_VERSION:
|
|
1339
1339
|
raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
|
|
1340
1340
|
return self
|
|
@@ -1346,7 +1346,7 @@ class MarkovAnalysisTextCollectionReport(AnalysisSchemaModel):
|
|
|
1346
1346
|
|
|
1347
1347
|
:ivar status: Stage status.
|
|
1348
1348
|
:vartype status: MarkovAnalysisStageStatus
|
|
1349
|
-
:ivar source_items: Count of items in extraction
|
|
1349
|
+
:ivar source_items: Count of items in extraction snapshot.
|
|
1350
1350
|
:vartype source_items: int
|
|
1351
1351
|
:ivar documents: Count of documents included.
|
|
1352
1352
|
:vartype documents: int
|
|
@@ -1517,8 +1517,8 @@ class MarkovAnalysisOutput(AnalysisSchemaModel):
|
|
|
1517
1517
|
:vartype analysis_id: str
|
|
1518
1518
|
:ivar generated_at: International Organization for Standardization 8601 timestamp for output creation.
|
|
1519
1519
|
:vartype generated_at: str
|
|
1520
|
-
:ivar
|
|
1521
|
-
:vartype
|
|
1520
|
+
:ivar snapshot: Analysis snapshot manifest.
|
|
1521
|
+
:vartype snapshot: AnalysisRunManifest
|
|
1522
1522
|
:ivar report: Markov analysis report data.
|
|
1523
1523
|
:vartype report: MarkovAnalysisReport
|
|
1524
1524
|
"""
|
|
@@ -1526,5 +1526,5 @@ class MarkovAnalysisOutput(AnalysisSchemaModel):
|
|
|
1526
1526
|
schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
|
|
1527
1527
|
analysis_id: str
|
|
1528
1528
|
generated_at: str
|
|
1529
|
-
|
|
1529
|
+
snapshot: AnalysisRunManifest
|
|
1530
1530
|
report: MarkovAnalysisReport
|
biblicus/analysis/profiling.py
CHANGED
|
@@ -12,20 +12,20 @@ from typing import Dict, Iterable, List, Sequence
|
|
|
12
12
|
from pydantic import BaseModel
|
|
13
13
|
|
|
14
14
|
from ..corpus import Corpus
|
|
15
|
-
from ..models import CatalogItem,
|
|
15
|
+
from ..models import CatalogItem, ExtractionSnapshotReference
|
|
16
16
|
from ..retrieval import hash_text
|
|
17
17
|
from ..time import utc_now_iso
|
|
18
18
|
from .base import CorpusAnalysisBackend
|
|
19
19
|
from .models import (
|
|
20
|
-
|
|
20
|
+
AnalysisConfigurationManifest,
|
|
21
21
|
AnalysisRunInput,
|
|
22
22
|
AnalysisRunManifest,
|
|
23
|
+
ProfilingConfiguration,
|
|
23
24
|
ProfilingDistributionReport,
|
|
24
25
|
ProfilingExtractedTextReport,
|
|
25
26
|
ProfilingOutput,
|
|
26
27
|
ProfilingPercentileValue,
|
|
27
28
|
ProfilingRawItemsReport,
|
|
28
|
-
ProfilingRecipeConfig,
|
|
29
29
|
ProfilingReport,
|
|
30
30
|
ProfilingTagCount,
|
|
31
31
|
ProfilingTagReport,
|
|
@@ -46,62 +46,67 @@ class ProfilingBackend(CorpusAnalysisBackend):
|
|
|
46
46
|
self,
|
|
47
47
|
corpus: Corpus,
|
|
48
48
|
*,
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
49
|
+
configuration_name: str,
|
|
50
|
+
configuration: Dict[str, object],
|
|
51
|
+
extraction_snapshot: ExtractionSnapshotReference,
|
|
52
52
|
) -> BaseModel:
|
|
53
53
|
"""
|
|
54
54
|
Run the profiling analysis pipeline.
|
|
55
55
|
|
|
56
56
|
:param corpus: Corpus to analyze.
|
|
57
57
|
:type corpus: Corpus
|
|
58
|
-
:param
|
|
59
|
-
:type
|
|
60
|
-
:param
|
|
61
|
-
:type
|
|
62
|
-
:param
|
|
63
|
-
:type
|
|
58
|
+
:param configuration_name: Human-readable configuration name.
|
|
59
|
+
:type configuration_name: str
|
|
60
|
+
:param configuration: Analysis configuration values.
|
|
61
|
+
:type configuration: dict[str, object]
|
|
62
|
+
:param extraction_snapshot: Extraction snapshot reference for text inputs.
|
|
63
|
+
:type extraction_snapshot: biblicus.models.ExtractionSnapshotReference
|
|
64
64
|
:return: Profiling output model.
|
|
65
65
|
:rtype: pydantic.BaseModel
|
|
66
66
|
"""
|
|
67
67
|
parsed_config = (
|
|
68
|
-
|
|
69
|
-
if isinstance(
|
|
70
|
-
else
|
|
68
|
+
configuration
|
|
69
|
+
if isinstance(configuration, ProfilingConfiguration)
|
|
70
|
+
else ProfilingConfiguration.model_validate(configuration)
|
|
71
71
|
)
|
|
72
72
|
return _run_profiling(
|
|
73
73
|
corpus=corpus,
|
|
74
|
-
|
|
74
|
+
configuration_name=configuration_name,
|
|
75
75
|
config=parsed_config,
|
|
76
|
-
|
|
76
|
+
extraction_snapshot=extraction_snapshot,
|
|
77
77
|
)
|
|
78
78
|
|
|
79
79
|
|
|
80
80
|
def _run_profiling(
|
|
81
81
|
*,
|
|
82
82
|
corpus: Corpus,
|
|
83
|
-
|
|
84
|
-
config:
|
|
85
|
-
|
|
83
|
+
configuration_name: str,
|
|
84
|
+
config: ProfilingConfiguration,
|
|
85
|
+
extraction_snapshot: ExtractionSnapshotReference,
|
|
86
86
|
) -> ProfilingOutput:
|
|
87
|
-
|
|
87
|
+
configuration_manifest = _create_configuration_manifest(
|
|
88
|
+
name=configuration_name,
|
|
89
|
+
config=config,
|
|
90
|
+
)
|
|
88
91
|
catalog = corpus.load_catalog()
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
+
snapshot_id = _analysis_snapshot_id(
|
|
93
|
+
configuration_id=configuration_manifest.configuration_id,
|
|
94
|
+
extraction_snapshot=extraction_snapshot,
|
|
92
95
|
catalog_generated_at=catalog.generated_at,
|
|
93
96
|
)
|
|
94
97
|
run_manifest = AnalysisRunManifest(
|
|
95
|
-
|
|
96
|
-
|
|
98
|
+
snapshot_id=snapshot_id,
|
|
99
|
+
configuration=configuration_manifest,
|
|
97
100
|
corpus_uri=catalog.corpus_uri,
|
|
98
101
|
catalog_generated_at=catalog.generated_at,
|
|
99
102
|
created_at=utc_now_iso(),
|
|
100
|
-
input=AnalysisRunInput(
|
|
103
|
+
input=AnalysisRunInput(extraction_snapshot=extraction_snapshot),
|
|
101
104
|
artifact_paths=[],
|
|
102
105
|
stats={},
|
|
103
106
|
)
|
|
104
|
-
run_dir = corpus.analysis_run_dir(
|
|
107
|
+
run_dir = corpus.analysis_run_dir(
|
|
108
|
+
analysis_id=ProfilingBackend.analysis_id, snapshot_id=snapshot_id
|
|
109
|
+
)
|
|
105
110
|
output_path = run_dir / "output.json"
|
|
106
111
|
run_dir.mkdir(parents=True, exist_ok=True)
|
|
107
112
|
|
|
@@ -109,7 +114,7 @@ def _run_profiling(
|
|
|
109
114
|
raw_report = _build_raw_items_report(items=ordered_items, config=config)
|
|
110
115
|
extracted_report = _build_extracted_text_report(
|
|
111
116
|
corpus=corpus,
|
|
112
|
-
|
|
117
|
+
extraction_snapshot=extraction_snapshot,
|
|
113
118
|
config=config,
|
|
114
119
|
)
|
|
115
120
|
|
|
@@ -133,15 +138,17 @@ def _run_profiling(
|
|
|
133
138
|
output = ProfilingOutput(
|
|
134
139
|
analysis_id=ProfilingBackend.analysis_id,
|
|
135
140
|
generated_at=utc_now_iso(),
|
|
136
|
-
|
|
141
|
+
snapshot=run_manifest,
|
|
137
142
|
report=report,
|
|
138
143
|
)
|
|
139
144
|
_write_profiling_output(path=output_path, output=output)
|
|
140
145
|
return output
|
|
141
146
|
|
|
142
147
|
|
|
143
|
-
def
|
|
144
|
-
|
|
148
|
+
def _create_configuration_manifest(
|
|
149
|
+
*, name: str, config: ProfilingConfiguration
|
|
150
|
+
) -> AnalysisConfigurationManifest:
|
|
151
|
+
configuration_payload = json.dumps(
|
|
145
152
|
{
|
|
146
153
|
"analysis_id": ProfilingBackend.analysis_id,
|
|
147
154
|
"name": name,
|
|
@@ -149,9 +156,9 @@ def _create_recipe_manifest(*, name: str, config: ProfilingRecipeConfig) -> Anal
|
|
|
149
156
|
},
|
|
150
157
|
sort_keys=True,
|
|
151
158
|
)
|
|
152
|
-
|
|
153
|
-
return
|
|
154
|
-
|
|
159
|
+
configuration_id = hash_text(configuration_payload)
|
|
160
|
+
return AnalysisConfigurationManifest(
|
|
161
|
+
configuration_id=configuration_id,
|
|
155
162
|
analysis_id=ProfilingBackend.analysis_id,
|
|
156
163
|
name=name,
|
|
157
164
|
created_at=utc_now_iso(),
|
|
@@ -159,10 +166,13 @@ def _create_recipe_manifest(*, name: str, config: ProfilingRecipeConfig) -> Anal
|
|
|
159
166
|
)
|
|
160
167
|
|
|
161
168
|
|
|
162
|
-
def
|
|
163
|
-
*,
|
|
169
|
+
def _analysis_snapshot_id(
|
|
170
|
+
*,
|
|
171
|
+
configuration_id: str,
|
|
172
|
+
extraction_snapshot: ExtractionSnapshotReference,
|
|
173
|
+
catalog_generated_at: str,
|
|
164
174
|
) -> str:
|
|
165
|
-
run_seed = f"{
|
|
175
|
+
run_seed = f"{configuration_id}:{extraction_snapshot.as_string()}:{catalog_generated_at}"
|
|
166
176
|
return hash_text(run_seed)
|
|
167
177
|
|
|
168
178
|
|
|
@@ -186,7 +196,7 @@ def _ordered_catalog_items(
|
|
|
186
196
|
|
|
187
197
|
|
|
188
198
|
def _build_raw_items_report(
|
|
189
|
-
*, items: Sequence[CatalogItem], config:
|
|
199
|
+
*, items: Sequence[CatalogItem], config: ProfilingConfiguration
|
|
190
200
|
) -> ProfilingRawItemsReport:
|
|
191
201
|
media_type_counts: Dict[str, int] = {}
|
|
192
202
|
for item in items:
|
|
@@ -205,7 +215,7 @@ def _build_raw_items_report(
|
|
|
205
215
|
|
|
206
216
|
|
|
207
217
|
def _build_tag_report(
|
|
208
|
-
*, items: Sequence[CatalogItem], config:
|
|
218
|
+
*, items: Sequence[CatalogItem], config: ProfilingConfiguration
|
|
209
219
|
) -> ProfilingTagReport:
|
|
210
220
|
tag_filters = config.tag_filters
|
|
211
221
|
tag_filter_set = set(tag_filters or [])
|
|
@@ -236,20 +246,20 @@ def _build_tag_report(
|
|
|
236
246
|
def _build_extracted_text_report(
|
|
237
247
|
*,
|
|
238
248
|
corpus: Corpus,
|
|
239
|
-
|
|
240
|
-
config:
|
|
249
|
+
extraction_snapshot: ExtractionSnapshotReference,
|
|
250
|
+
config: ProfilingConfiguration,
|
|
241
251
|
) -> ProfilingExtractedTextReport:
|
|
242
|
-
manifest = corpus.
|
|
243
|
-
extractor_id=
|
|
244
|
-
|
|
252
|
+
manifest = corpus.load_extraction_snapshot_manifest(
|
|
253
|
+
extractor_id=extraction_snapshot.extractor_id,
|
|
254
|
+
snapshot_id=extraction_snapshot.snapshot_id,
|
|
245
255
|
)
|
|
246
256
|
nonempty_items = 0
|
|
247
257
|
empty_items = 0
|
|
248
258
|
missing_items = 0
|
|
249
259
|
text_lengths: List[int] = []
|
|
250
|
-
text_dir = corpus.
|
|
251
|
-
extractor_id=
|
|
252
|
-
|
|
260
|
+
text_dir = corpus.extraction_snapshot_dir(
|
|
261
|
+
extractor_id=extraction_snapshot.extractor_id,
|
|
262
|
+
snapshot_id=extraction_snapshot.snapshot_id,
|
|
253
263
|
)
|
|
254
264
|
|
|
255
265
|
for item_result in manifest.items:
|