indw 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/cli.py +34 -0
- app/commands/__init__.py +16 -0
- app/commands/audit.py +22 -0
- app/commands/benchmark.py +15 -0
- app/commands/doctor.py +22 -0
- app/commands/merge.py +43 -0
- app/commands/test.py +20 -0
- app/commands/validate.py +15 -0
- app/workflows.py +70 -0
- indw/__init__.py +42 -0
- indw/_compat.py +35 -0
- indw/clean/__init__.py +29 -0
- indw/clean/artifact/calibrate.py +80 -0
- indw/clean/artifact/confidence.py +132 -0
- indw/clean/artifact/decompose.py +265 -0
- indw/clean/artifact/discovery_clean.py +57 -0
- indw/clean/artifact/discovery_config.py +74 -0
- indw/clean/artifact/discovery_corpus.py +364 -0
- indw/clean/artifact/discovery_engine.py +229 -0
- indw/clean/artifact/discovery_registry.py +191 -0
- indw/clean/artifact/discovery_structural.py +48 -0
- indw/clean/artifact/discovery_validation.py +54 -0
- indw/clean/artifact/engine.py +161 -0
- indw/clean/artifact/evidence.py +73 -0
- indw/clean/artifact/evidence_cache.py +348 -0
- indw/clean/artifact/evidence_engine.py +328 -0
- indw/clean/artifact/evidence_features.py +197 -0
- indw/clean/artifact/evidence_model.py +536 -0
- indw/clean/artifact/evidence_util.py +71 -0
- indw/clean/artifact/novelty.py +34 -0
- indw/clean/artifact/positional.py +29 -0
- indw/clean/artifact/registry.py +139 -0
- indw/clean/artifact/safeguards.py +42 -0
- indw/clean/artifact/strip.py +144 -0
- indw/clean/artifact/trim.py +76 -0
- indw/clean/corpus.py +712 -0
- indw/clean/document/adaptive.py +161 -0
- indw/clean/document/boilerplate.py +71 -0
- indw/clean/document/clean.py +56 -0
- indw/clean/document/code_preservation.py +287 -0
- indw/clean/document/compression.py +109 -0
- indw/clean/document/config.py +148 -0
- indw/clean/document/conversation.py +219 -0
- indw/clean/document/dedup.py +33 -0
- indw/clean/document/html.py +189 -0
- indw/clean/document/license.py +702 -0
- indw/clean/document/metrics.py +293 -0
- indw/clean/document/normalize.py +136 -0
- indw/clean/document/patterns.py +96 -0
- indw/clean/document/segment.py +217 -0
- indw/clean/document/stage_manifest.py +94 -0
- indw/clean/document/stats.py +134 -0
- indw/clean/document/ui.py +103 -0
- indw/clean/document/validate.py +105 -0
- indw/clean/document/value.py +557 -0
- indw/clean/gate/evaluate.py +502 -0
- indw/clean/gate/policy.py +88 -0
- indw/clean/meta/clean.py +225 -0
- indw/clean/meta/foundation.py +505 -0
- indw/clean/meta/patterns.py +358 -0
- indw/clean/meta/stats.py +72 -0
- indw/clean/meta/strip.py +319 -0
- indw/clean/semantic/boilerplate.py +102 -0
- indw/clean/semantic/classifier.py +174 -0
- indw/clean/semantic/clean.py +145 -0
- indw/clean/semantic/config.py +114 -0
- indw/clean/semantic/embedded.py +396 -0
- indw/clean/semantic/fingerprints.py +89 -0
- indw/clean/semantic/ocr_normalize.py +91 -0
- indw/clean/semantic/pipeline.py +241 -0
- indw/clean/semantic/report.py +126 -0
- indw/clean/semantic/routing.py +146 -0
- indw/clean/semantic/scoring.py +86 -0
- indw/clean/semantic/section_artifacts.py +345 -0
- indw/clean/semantic/spec.py +8 -0
- indw/clean/semantic/structure.py +241 -0
- indw/clean/semantic/thresholds.py +52 -0
- indw/clean/structure/__init__.py +35 -0
- indw/clean/structure/extract.py +121 -0
- indw/clean/structure/labeled_qa.py +104 -0
- indw/clean/structure/reference_sections.py +87 -0
- indw/config/__init__.py +44 -0
- indw/config/defaults.py +147 -0
- indw/config/loader.py +215 -0
- indw/config/resolve.py +171 -0
- indw/config/validation.py +103 -0
- indw/dedup/__init__.py +21 -0
- indw/dedup/backends/__init__.py +21 -0
- indw/dedup/backends/fuzzy.py +7 -0
- indw/dedup/embed/__init__.py +10 -0
- indw/dedup/embed/ann.py +61 -0
- indw/dedup/embed/candidate.py +84 -0
- indw/dedup/embed/cluster.py +45 -0
- indw/dedup/embed/config.py +59 -0
- indw/dedup/embed/contracts.py +104 -0
- indw/dedup/embed/e5.py +324 -0
- indw/dedup/embed/pipeline.py +212 -0
- indw/dedup/embed/pools/__init__.py +0 -0
- indw/dedup/embed/pools/gpu_worker.py +34 -0
- indw/dedup/embed/providers.py +93 -0
- indw/dedup/embed/representative.py +20 -0
- indw/dedup/embed/similarity.py +36 -0
- indw/dedup/embed/threshold.py +47 -0
- indw/dedup/exact.py +173 -0
- indw/dedup/fuzzy.py +110 -0
- indw/dedup/normalize.py +58 -0
- indw/dedup/replay.py +68 -0
- indw/dedup/semantic.py +140 -0
- indw/dedup/service/__init__.py +3 -0
- indw/dedup/service/exact_shard.py +106 -0
- indw/dedup/storage.py +60 -0
- indw/extract/__init__.py +20 -0
- indw/extract/assess/doc_type.py +64 -0
- indw/extract/assess/engine.py +226 -0
- indw/extract/assess/feedback.py +79 -0
- indw/extract/assess/metrics.py +116 -0
- indw/extract/assess/quality.py +70 -0
- indw/extract/core/clean.py +389 -0
- indw/extract/core/context.py +312 -0
- indw/extract/core/profile.py +223 -0
- indw/extract/core/units.py +1020 -0
- indw/extract/nav/context.py +442 -0
- indw/extract/nav/metrics.py +138 -0
- indw/extract/nav/template.py +83 -0
- indw/extract/roles/education.py +551 -0
- indw/extract/roles/forum.py +733 -0
- indw/extract/roles/publication.py +1053 -0
- indw/extract/roles/units.py +246 -0
- indw/extract/sections/boundaries.py +535 -0
- indw/extract/sections/classify.py +594 -0
- indw/extract/sections/integrity.py +462 -0
- indw/extract/sections/quality.py +440 -0
- indw/extract/sections/scratch.py +37 -0
- indw/extract/sections/semantic.py +522 -0
- indw/extract/structure/aggregate.py +773 -0
- indw/extract/structure/analyze.py +150 -0
- indw/extract/structure/document.py +67 -0
- indw/extract/structure/inline.py +457 -0
- indw/extract/structure/recovery.py +68 -0
- indw/extract/structure/segment.py +129 -0
- indw/filter/__init__.py +23 -0
- indw/filter/content/code.py +681 -0
- indw/filter/content/domain.py +65 -0
- indw/filter/content/filters.py +504 -0
- indw/filter/content/metadata.py +143 -0
- indw/filter/decide/calibrate.py +207 -0
- indw/filter/decide/curator.py +92 -0
- indw/filter/decide/engine.py +302 -0
- indw/filter/decide/policy.py +374 -0
- indw/filter/decide/threshold.py +160 -0
- indw/filter/gate/diagnostics.py +90 -0
- indw/filter/gate/quality.py +528 -0
- indw/filter/gate/reports.py +153 -0
- indw/filter/gate/scorer.py +77 -0
- indw/filter/language/bridge.py +54 -0
- indw/filter/language/confidence.py +42 -0
- indw/filter/language/config.py +132 -0
- indw/filter/language/detect.py +168 -0
- indw/filter/language/fast_detector.py +86 -0
- indw/filter/language/mixed.py +85 -0
- indw/filter/language/reports.py +127 -0
- indw/filter/language/script.py +339 -0
- indw/filter/language/script_metrics.py +157 -0
- indw/filter/language/script_opt.py +141 -0
- indw/filter/language/script_orch.py +84 -0
- indw/filter/language/script_policy.py +40 -0
- indw/filter/language/script_table.py +127 -0
- indw/filter/language/stats.py +39 -0
- indw/filter/language/telemetry.py +161 -0
- indw/filter/language/token_metrics.py +121 -0
- indw/filter/language/validation.py +218 -0
- indw/filter/license/classifier.py +68 -0
- indw/filter/license/config.py +89 -0
- indw/filter/license/detector.py +188 -0
- indw/filter/license/manifest.py +70 -0
- indw/filter/license/normalize.py +140 -0
- indw/filter/license/policy.py +244 -0
- indw/filter/license/record.py +74 -0
- indw/filter/license/reports.py +104 -0
- indw/filter/license/schema.py +101 -0
- indw/filter/license/source_policy.py +157 -0
- indw/filter/pii/config.py +130 -0
- indw/filter/pii/context.py +72 -0
- indw/filter/pii/detect.py +76 -0
- indw/filter/pii/entities.py +182 -0
- indw/filter/pii/redaction.py +25 -0
- indw/filter/pii/reports.py +96 -0
- indw/filter/pii/risk.py +91 -0
- indw/filter/pii/secrets.py +206 -0
- indw/filter/pii/validation.py +180 -0
- indw/filter/refine/audit.py +124 -0
- indw/filter/refine/corpus.py +226 -0
- indw/filter/refine/processor.py +102 -0
- indw/filter/refine/rewrite.py +146 -0
- indw/filter/refine/settings.py +74 -0
- indw/filter/refine/truncation.py +189 -0
- indw/filter/score/adaptive.py +65 -0
- indw/filter/score/analysis.py +274 -0
- indw/filter/score/artifacts.py +174 -0
- indw/filter/score/builder.py +167 -0
- indw/filter/score/canonical.py +112 -0
- indw/filter/score/continuous.py +199 -0
- indw/filter/score/engine.py +41 -0
- indw/filter/score/signals.py +179 -0
- indw/filter/score/types.py +108 -0
- indw/filter/spec/document.py +163 -0
- indw/filter/spec/export.py +44 -0
- indw/filter/spec/pipeline.py +441 -0
- indw/filter/spec/quality.py +412 -0
- indw/filter/spec/validate.py +39 -0
- indw/filter/stage0/admission.py +68 -0
- indw/filter/stage0/audit.py +528 -0
- indw/filter/stage0/engine.py +289 -0
- indw/filter/stage0/verify.py +316 -0
- indw/filter/toxicity/classifier_labels.py +10 -0
- indw/filter/toxicity/config.py +138 -0
- indw/filter/toxicity/context.py +100 -0
- indw/filter/toxicity/detect.py +90 -0
- indw/filter/toxicity/patterns.py +62 -0
- indw/filter/toxicity/reports.py +96 -0
- indw/filter/toxicity/rule_scorer.py +89 -0
- indw/filter/toxicity/rules.py +83 -0
- indw/filter/toxicity/scorer.py +126 -0
- indw/filter/toxicity/validation.py +146 -0
- indw/ingest/__init__.py +18 -0
- indw/ingest/download.py +199 -0
- indw/ingest/format.py +165 -0
- indw/ingest/hash.py +65 -0
- indw/ingest/hf_datasets.py +79 -0
- indw/ingest/hf_env.py +12 -0
- indw/ingest/log.py +57 -0
- indw/ingest/resume.py +39 -0
- indw/ingest/run.py +166 -0
- indw/ingest/sink.py +132 -0
- indw/ingest/transcript.py +56 -0
- indw/schedule/__init__.py +18 -0
- indw/schedule/admission/__init__.py +21 -0
- indw/schedule/admission/tier01.py +90 -0
- indw/schedule/admission/tiers.py +35 -0
- indw/schedule/admission/tracker.py +61 -0
- indw/schedule/apply/coordinator.py +165 -0
- indw/schedule/apply/dedup.py +121 -0
- indw/schedule/apply/lifecycle.py +145 -0
- indw/schedule/apply/merge.py +284 -0
- indw/schedule/apply/serialize.py +79 -0
- indw/schedule/architecture/__init__.py +26 -0
- indw/schedule/architecture/classify.py +76 -0
- indw/schedule/architecture/graph.py +67 -0
- indw/schedule/architecture/ownership.py +131 -0
- indw/schedule/architecture/resources.py +44 -0
- indw/schedule/backends/__init__.py +13 -0
- indw/schedule/backends/config.py +40 -0
- indw/schedule/backends/contract.py +30 -0
- indw/schedule/backends/dask.py +131 -0
- indw/schedule/backends/factory.py +27 -0
- indw/schedule/backends/local.py +76 -0
- indw/schedule/backends/multiprocess.py +120 -0
- indw/schedule/backends/thread.py +121 -0
- indw/schedule/config/hardware.py +140 -0
- indw/schedule/config/pin.py +47 -0
- indw/schedule/config/policy.py +356 -0
- indw/schedule/config/resolve.py +80 -0
- indw/schedule/config/tune.py +258 -0
- indw/schedule/core.py +128 -0
- indw/schedule/dispatch/alloc.py +289 -0
- indw/schedule/dispatch/lanes.py +178 -0
- indw/schedule/dispatch/parallel.py +802 -0
- indw/schedule/dispatch/workers.py +252 -0
- indw/schedule/graph/__init__.py +3 -0
- indw/schedule/graph/artifacts.py +33 -0
- indw/schedule/graph/config.py +30 -0
- indw/schedule/graph/envelope.py +5 -0
- indw/schedule/graph/queues.py +161 -0
- indw/schedule/graph/runner.py +297 -0
- indw/schedule/ingest/coordinator.py +20 -0
- indw/schedule/intel/coordination.py +106 -0
- indw/schedule/intel/fingerprints.py +171 -0
- indw/schedule/intel/genome.py +229 -0
- indw/schedule/intel/hardware.py +109 -0
- indw/schedule/intel/incremental.py +192 -0
- indw/schedule/intel/inheritance.py +55 -0
- indw/schedule/intel/lci_graph.py +599 -0
- indw/schedule/intel/lci_router.py +112 -0
- indw/schedule/intel/merge_session.py +72 -0
- indw/schedule/intel/pci.py +307 -0
- indw/schedule/intel/pools/__init__.py +0 -0
- indw/schedule/intel/pools/acim.py +43 -0
- indw/schedule/intel/pools/pci.py +41 -0
- indw/schedule/intel/promotion.py +138 -0
- indw/schedule/intel/router.py +117 -0
- indw/schedule/intel/scores.py +86 -0
- indw/schedule/intel/session.py +417 -0
- indw/schedule/intel/store.py +289 -0
- indw/schedule/mix/config.py +80 -0
- indw/schedule/mix/curriculum.py +97 -0
- indw/schedule/mix/mixture_planner.py +130 -0
- indw/schedule/mix/plan.py +83 -0
- indw/schedule/mix/sampler.py +181 -0
- indw/schedule/mix/telemetry.py +116 -0
- indw/schedule/monitor/audit.py +23 -0
- indw/schedule/monitor/budget.py +33 -0
- indw/schedule/monitor/cost.py +183 -0
- indw/schedule/monitor/cpu.py +17 -0
- indw/schedule/monitor/doc.py +264 -0
- indw/schedule/monitor/invariants.py +105 -0
- indw/schedule/monitor/live.py +211 -0
- indw/schedule/monitor/obs.py +123 -0
- indw/schedule/monitor/pipeline_exporter.py +744 -0
- indw/schedule/read/gates.py +163 -0
- indw/schedule/read/ingest.py +38 -0
- indw/schedule/read/preprocess.py +53 -0
- indw/schedule/read/probe.py +230 -0
- indw/schedule/read/sources.py +57 -0
- indw/schedule/routing/__init__.py +0 -0
- indw/schedule/routing/admission.py +24 -0
- indw/schedule/row/index.py +127 -0
- indw/schedule/row/provenance.py +37 -0
- indw/schedule/row/reject.py +58 -0
- indw/schedule/row/resolve.py +40 -0
- indw/schedule/row/signals.py +55 -0
- indw/schedule/stages/__init__.py +21 -0
- indw/schedule/stages/artifact_cleaning.py +11 -0
- indw/schedule/stages/classification.py +96 -0
- indw/schedule/stages/contracts.py +71 -0
- indw/schedule/stages/curator.py +20 -0
- indw/schedule/stages/engine.py +325 -0
- indw/schedule/stages/knowledge.py +16 -0
- indw/schedule/stages/normalization.py +11 -0
- indw/schedule/stages/pools/__init__.py +20 -0
- indw/schedule/stages/pools/chain.py +111 -0
- indw/schedule/stages/pools/clean.py +57 -0
- indw/schedule/stages/pools/filter.py +58 -0
- indw/schedule/stages/pools/preprocess.py +67 -0
- indw/schedule/stages/pools/stage0.py +93 -0
- indw/schedule/stages/quality.py +20 -0
- indw/schedule/stages/rewrite.py +17 -0
- indw/schedule/stages/runner.py +121 -0
- indw/schedule/stages/structural_repair.py +61 -0
- indw/schedule/stages/validation.py +15 -0
- indw/schedule/state/artifacts.py +358 -0
- indw/schedule/state/checkpoint.py +770 -0
- indw/schedule/state/context.py +130 -0
- indw/schedule/state/lock.py +135 -0
- indw/schedule/state/sessions.py +81 -0
- indw/schedule/state/setup.py +235 -0
- indw/schedule/state/survivor.py +87 -0
- indw/store/corpus/manifest.py +80 -0
- indw/store/corpus/registry.py +134 -0
- indw/store/eval/compare.py +121 -0
- indw/store/eval/config.py +87 -0
- indw/store/eval/decision.py +93 -0
- indw/store/eval/diversity.py +51 -0
- indw/store/eval/evaluator.py +153 -0
- indw/store/eval/knowledge.py +18 -0
- indw/store/eval/metrics.py +79 -0
- indw/store/eval/reports.py +102 -0
- indw/store/eval/scoring.py +53 -0
- indw/store/eval/validation.py +214 -0
- indw/store/export/config.py +73 -0
- indw/store/export/export_items.py +9 -0
- indw/store/export/fast_export.py +173 -0
- indw/store/export/memmap_stream.py +159 -0
- indw/store/export/packed_stream.py +323 -0
- indw/store/export/packing/__init__.py +12 -0
- indw/store/export/packing/binpack.py +234 -0
- indw/store/export/packing/collate.py +53 -0
- indw/store/export/packing/config.py +35 -0
- indw/store/export/pipeline.py +117 -0
- indw/store/export/prefetch.py +101 -0
- indw/store/export/replay_export.py +49 -0
- indw/store/export/shard_io.py +113 -0
- indw/store/export/shard_meta.py +129 -0
- indw/store/export/splits.py +37 -0
- indw/store/io/atomic.py +101 -0
- indw/store/io/cache.py +101 -0
- indw/store/io/columnar.py +73 -0
- indw/store/io/json_codec.py +33 -0
- indw/store/io/jsonl.py +93 -0
- indw/store/io/retry.py +106 -0
- indw/tools/__init__.py +0 -0
- indw/tools/metrics/alerts.py +57 -0
- indw/tools/metrics/config.py +70 -0
- indw/tools/metrics/pipeline_health.py +198 -0
- indw/tools/metrics/recovery.py +65 -0
- indw/tools/metrics/regression.py +191 -0
- indw/tools/metrics/reject_log.py +74 -0
- indw/tools/metrics/reports.py +114 -0
- indw/tools/metrics/snapshot.py +153 -0
- indw/tools/metrics/stage_profile.py +134 -0
- indw/tools/metrics/storage.py +71 -0
- indw/tools/metrics/trends.py +86 -0
- indw/tools/metrics/validation.py +199 -0
- indw/tools/reports/__init__.py +0 -0
- indw/tools/reports/admission_cost.py +187 -0
- indw/tools/reports/audit_left/artifact_leakage.py +105 -0
- indw/tools/reports/audit_left/pipeline.py +344 -0
- indw/tools/reports/audit_left/validation.py +526 -0
- indw/tools/reports/batch_efficiency_audit.py +174 -0
- indw/tools/reports/benchmark/__init__.py +21 -0
- indw/tools/reports/benchmark/scale.py +462 -0
- indw/tools/reports/dask_integration.py +149 -0
- indw/tools/reports/execution_consolidation.py +210 -0
- indw/tools/reports/fast/__init__.py +0 -0
- indw/tools/reports/fast/analyze.py +521 -0
- indw/tools/reports/fast/patterns.py +22 -0
- indw/tools/reports/fast/report.py +95 -0
- indw/tools/reports/fast/sample.py +81 -0
- indw/tools/reports/fast/stats.py +110 -0
- indw/tools/reports/foundation_cost.py +154 -0
- indw/tools/reports/heavy_cost.py +169 -0
- indw/tools/reports/library_migration.py +216 -0
- indw/tools/reports/pipeline_audit.py +216 -0
- indw/tools/reports/pipeline_tune_report.py +144 -0
- indw/tools/reports/production_scale_report.py +354 -0
- indw/tools/reports/stabilization_audit.py +362 -0
- indw/tools/reports/stage0_cost.py +164 -0
- indw/util/hf_tokenizers.py +21 -0
- indw/util/stable_hash.py +41 -0
- indw/util/stats.py +15 -0
- indw-1.0.dist-info/METADATA +279 -0
- indw-1.0.dist-info/RECORD +426 -0
- indw-1.0.dist-info/WHEEL +5 -0
- indw-1.0.dist-info/entry_points.txt +2 -0
- indw-1.0.dist-info/licenses/LICENSE +190 -0
- indw-1.0.dist-info/licenses/NOTICE +17 -0
- indw-1.0.dist-info/top_level.txt +2 -0
app/cli.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from app.commands import (
|
|
7
|
+
register_audit,
|
|
8
|
+
register_benchmark,
|
|
9
|
+
register_doctor,
|
|
10
|
+
register_merge,
|
|
11
|
+
register_test,
|
|
12
|
+
register_validate,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``indw`` argument parser.

    A sub-command is mandatory; its name is stored in ``args.command``.
    Each ``register_*`` hook attaches one sub-command to the shared
    sub-parser group.
    """
    parser = argparse.ArgumentParser(prog="indw", description="INDW — Instant Data Workflow")
    subcommands = parser.add_subparsers(dest="command", required=True)
    # Hooks are applied in this fixed order.
    hooks = (
        register_merge,
        register_test,
        register_validate,
        register_audit,
        register_benchmark,
        register_doctor,
    )
    for attach in hooks:
        attach(subcommands)
    return parser
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected sub-command.

    *argv* is passed to ``parse_args`` unchanged. The callable stored on the
    namespace as ``_handler`` is invoked with the parsed namespace, and its
    result is converted to ``int`` and returned.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace._handler
    status = handler(namespace)
    return int(status)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Script entry point: terminate with the status returned by ``main``.
if __name__ == "__main__":
    sys.exit(main())
|
app/commands/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from app.commands.audit import register as register_audit
|
|
2
|
+
from app.commands.benchmark import register as register_benchmark
|
|
3
|
+
from app.commands.doctor import register as register_doctor
|
|
4
|
+
from app.commands.merge import register as register_merge
|
|
5
|
+
from app.commands.test import register as register_test
|
|
6
|
+
|
|
7
|
+
from app.commands.validate import register as register_validate
|
|
8
|
+
|
|
9
|
+
# Public API of the package: one ``register_*`` hook per CLI sub-command.
__all__ = [
    "register_audit",
    "register_benchmark",
    "register_doctor",
    "register_merge",
    "register_test",
    "register_validate",
]
|
app/commands/audit.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``audit`` sub-command to *sub*.

    Options: ``--kind`` (restricted to the listed report kinds, default
    ``pipeline``), ``--work-dir`` (a ``Path``, default ``None``) and
    ``--workers`` (an ``int``, default 4). The namespace default
    ``_handler`` is set to ``run``.
    """
    report_kinds = ("pipeline", "dask", "production", "library", "stage0")
    parser = sub.add_parser("audit", help="run pipeline audit reports")
    parser.add_argument("--kind", choices=report_kinds, default="pipeline")
    parser.add_argument("--work-dir", type=Path, default=None)
    parser.add_argument("--workers", type=int, default=4)
    parser.set_defaults(_handler=run)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def run(args: argparse.Namespace) -> int:
    """Run the audit selected on the command line and return its status.

    ``app.workflows`` is imported inside the call rather than at module
    import time.
    """
    from app.workflows import run_audit

    options = {
        "kind": args.kind,
        "work_dir": args.work_dir,
        "workers": args.workers,
    }
    return run_audit(**options)
|
app/commands/benchmark.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``benchmark`` sub-command to *sub*.

    ``--workers`` is kept as a single string (default ``"1 2 4"``); the
    namespace default ``_handler`` is set to ``run``.
    """
    parser = sub.add_parser("benchmark", help="production scale benchmark")
    parser.add_argument(
        "--workers",
        default="1 2 4",
        help="worker counts",
    )
    parser.set_defaults(_handler=run)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def run(args: argparse.Namespace) -> int:
    """Run the benchmark workflow with the ``--workers`` string as given.

    ``app.workflows`` is imported inside the call rather than at module
    import time.
    """
    from app.workflows import run_benchmark

    worker_spec = args.workers
    return run_benchmark(workers=worker_spec)
|
app/commands/doctor.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import importlib.util
|
|
5
|
+
import platform
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the option-less ``doctor`` sub-command to *sub*.

    The namespace default ``_handler`` is set to ``run``.
    """
    parser = sub.add_parser(
        "doctor",
        help="check install and backend availability",
    )
    parser.set_defaults(_handler=run)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def run(_args: argparse.Namespace) -> int:
    """Print an installation report to stdout and return 0.

    Output, in order: one line with the ``indw`` version, Python version and
    platform string; one line with the value of
    ``pipeline_execution_backend()`` and the ``name`` of
    ``resolve_execution_backend()``; then one ``<pkg>=ok`` or
    ``<pkg>=missing`` line per probed package. The argument is unused.
    """
    import indw
    from indw.schedule.backends.config import pipeline_execution_backend
    from indw.schedule.backends.factory import resolve_execution_backend

    package_version = indw.__version__
    interpreter = platform.python_version()
    host = platform.platform()
    print(f"indw={package_version} python={interpreter} platform={host}")

    selected = pipeline_execution_backend()
    resolved = resolve_execution_backend().name
    print(f"backend={selected} resolved={resolved}")

    # A package counts as present when the import system can locate a spec.
    for pkg in ("orjson", "trafilatura", "dask"):
        status = "ok" if importlib.util.find_spec(pkg) else "missing"
        print(f"{pkg}={status}")
    return 0
|
app/commands/merge.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``merge`` sub-command to *sub*.

    Positionals: ``raw_dir`` and ``out_path`` (both ``Path``).
    Options: ``--work-dir`` (``Path``, default ``None``), ``--workers``
    (``int``, default 1), ``--chunk-size`` (``int``, default 500),
    ``--fresh`` (flag) and ``--backend`` (one of the listed backends,
    default ``None``). The namespace default ``_handler`` is set to ``run``.
    """
    backends = ("local", "thread", "multiprocess", "dask")
    parser = sub.add_parser("merge", help="run quality merge on raw corpus")

    # Positional arguments.
    parser.add_argument("raw_dir", type=Path)
    parser.add_argument("out_path", type=Path)

    # Optional arguments.
    parser.add_argument("--work-dir", type=Path, default=None)
    parser.add_argument("--workers", type=int, default=1)
    parser.add_argument("--chunk-size", type=int, default=500)
    parser.add_argument("--fresh", action="store_true")
    parser.add_argument(
        "--backend",
        choices=backends,
        default=None,
        help="execution backend (INSTANT_PIPELINE_BACKEND)",
    )
    parser.set_defaults(_handler=run)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def run(args: argparse.Namespace) -> int:
    """Run ``merge_with_quality`` with the parsed CLI options and return 0.

    Environment side effects: an explicit ``--backend`` overwrites
    ``INSTANT_PIPELINE_BACKEND``; ``INSTANT_MERGE_HW_PROBE`` is set to
    ``"0"`` only when it is not already present. ``resume`` is always the
    negation of ``--fresh``.
    """
    from indw.filter.spec.quality import QualityPipelineConfig
    from indw.schedule.core import merge_with_quality

    backend = args.backend
    if backend:
        os.environ["INSTANT_PIPELINE_BACKEND"] = backend
    os.environ.setdefault("INSTANT_MERGE_HW_PROBE", "0")

    start_fresh = args.fresh
    quality = QualityPipelineConfig()
    merge_with_quality(
        args.raw_dir,
        args.out_path,
        quality_config=quality,
        work_dir=args.work_dir,
        fresh=start_fresh,
        resume=not start_fresh,
        workers=args.workers,
        chunk_size=args.chunk_size,
    )
    return 0
|
app/commands/test.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``test`` sub-command to *sub*.

    ``--profile`` is restricted to the listed profile names (default
    ``unit``); any remaining positionals are collected into ``pytest_args``.
    The namespace default ``_handler`` is set to ``run``.
    """
    profile_names = ("unit", "critical", "parity", "integration", "smoke")
    parser = sub.add_parser("test", help="run framework test suite")
    parser.add_argument("--profile", choices=profile_names, default="unit")
    parser.add_argument("pytest_args", nargs="*", help="extra pytest arguments")
    parser.set_defaults(_handler=run)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def run(args: argparse.Namespace) -> int:
    """Run the test suite for the chosen profile and return its status.

    An empty ``pytest_args`` list is forwarded as ``None``.
    """
    from app.workflows import run_tests

    extra = args.pytest_args if args.pytest_args else None
    return run_tests(args.profile, extra_args=extra)
|
app/commands/validate.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``validate`` sub-command to *sub*.

    All positionals are collected into ``pytest_args``; the namespace
    default ``_handler`` is set to ``run``.
    """
    parser = sub.add_parser(
        "validate",
        help="run parity and acceptance validation",
    )
    parser.add_argument("pytest_args", nargs="*", help="extra pytest arguments")
    parser.set_defaults(_handler=run)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def run(args: argparse.Namespace) -> int:
    """Run the ``parity`` test profile and return its status.

    An empty ``pytest_args`` list is forwarded as ``None``.
    """
    from app.workflows import run_tests

    extra = args.pytest_args if args.pytest_args else None
    return run_tests("parity", extra_args=extra)
|
app/workflows.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
_ROOT = Path(__file__).resolve().parents[1]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_profiles() -> dict[str, dict[str, Any]]:
    """Return the table of pytest profiles, keyed by profile name.

    Each entry holds ``markers`` (a ``-m`` expression), ``parallel`` (bool)
    and ``paths`` (list of test paths). A fresh table is built on every call.
    """
    parity_targets = [
        "tests/subsystems/test_stage_pool_parity.py",
        "tests/subsystems/test_parallel_merge_parity.py",
        "tests/subsystems/test_tier_admission_parity.py",
        "tests/subsystems/test_execution_backend.py",
    ]
    table: dict[str, dict[str, Any]] = {}
    table["unit"] = dict(markers="not integration and not slow", parallel=True, paths=["tests/"])
    table["critical"] = dict(markers="critical and not integration", parallel=False, paths=["tests/subsystems/"])
    table["parity"] = dict(markers="integration", parallel=False, paths=parity_targets)
    table["integration"] = dict(markers="integration or slow", parallel=False, paths=["tests/"])
    table["smoke"] = dict(markers="smoke", parallel=False, paths=["tests/"])
    return table
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def run_tests(profile: str = "unit", *, extra_args: list[str] | None = None) -> int:
    """Run pytest for *profile* in a subprocess and return its exit code.

    The command is ``<python> -m pytest <paths> -m <markers> --tb=short
    --strict-markers``, followed by ``-n auto --dist loadfile -q`` for
    parallel profiles or ``-v`` otherwise, then any *extra_args*. The
    subprocess runs with ``_ROOT`` as its working directory.

    Raises:
        ValueError: if *profile* is not a key of ``test_profiles()``.
    """
    known = test_profiles()
    if profile not in known:
        raise ValueError(f"unknown profile {profile}; choose from {sorted(known)}")
    chosen = known[profile]

    cmd = [sys.executable, "-m", "pytest"]
    cmd += chosen["paths"]
    cmd += ["-m", chosen["markers"], "--tb=short", "--strict-markers"]
    if chosen.get("parallel"):
        cmd += ["-n", "auto", "--dist", "loadfile", "-q"]
    else:
        cmd += ["-v"]
    if extra_args:
        cmd += extra_args

    completed = subprocess.run(cmd, cwd=_ROOT)
    return completed.returncode
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def run_benchmark(*, workers: str = "1 2 4") -> int:
    """Run ``scripts/production_scale_audit.py`` and return its exit code.

    *workers* is split on whitespace and each token is passed after
    ``--workers``. When the script file is absent under ``_ROOT``, a message
    is written to stderr and 1 is returned without spawning anything.
    """
    target = _ROOT / "scripts" / "production_scale_audit.py"
    if target.is_file():
        worker_counts = workers.split()
        cmd = [sys.executable, str(target), "--workers"]
        cmd += worker_counts
        return subprocess.run(cmd, cwd=_ROOT).returncode
    print("benchmark script missing", file=sys.stderr)
    return 1
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def run_audit(*, kind: str = "pipeline", work_dir: Path | None = None, workers: int = 4) -> int:
    """Run the audit script registered for *kind* and return its exit code.

    An unrecognised *kind* falls back to the ``pipeline`` script. *workers*
    is only forwarded for ``stage0``; *work_dir* is only forwarded when
    *kind* is exactly ``"pipeline"``. When the script file is absent under
    ``_ROOT``, a message naming *kind* is written to stderr and 1 is
    returned.
    """
    audit_scripts = {
        "pipeline": ("scripts/pipeline_audit.py", []),
        "dask": ("scripts/dask_integration_report.py", []),
        "production": ("scripts/production_scale_audit.py", ["--workers", "1", "2"]),
        "library": ("scripts/library_migration_report.py", []),
        "stage0": ("scripts/stage0_production_verify.py", ["--workers", str(workers)]),
    }
    if kind in audit_scripts:
        relative_path, script_args = audit_scripts[kind]
    else:
        relative_path, script_args = audit_scripts["pipeline"]

    script = _ROOT / relative_path
    if not script.is_file():
        print(f"audit script not found: {kind}", file=sys.stderr)
        return 1

    cmd = [sys.executable, str(script)]
    cmd += script_args
    if kind == "pipeline" and work_dir is not None:
        cmd += ["--work-dir", str(work_dir)]
    return subprocess.run(cmd, cwd=_ROOT).returncode
|
indw/__init__.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
2
|
+
|
|
3
|
+
# Version of the installed ``indw`` distribution; placeholder when the
# distribution metadata cannot be found.
try:
    __version__ = version("indw")
except PackageNotFoundError:
    __version__ = "0.0.0"

# Public name -> (module path, attribute name), looked up by ``__getattr__``.
_LAZY_EXPORTS = {
    "CorpusRegistry": ("indw.store.corpus.registry", "CorpusRegistry"),
    "DatasetDownloader": ("indw.ingest.download", "DatasetDownloader"),
    "FastDatasetPipeline": ("indw.ingest.run", "FastDatasetPipeline"),
    "setup_dataset_logging": ("indw.ingest.log", "setup_dataset_logging"),
    "ScriptProfile": ("indw.filter.language.script", "ScriptProfile"),
    "analyze_script_profile": ("indw.filter.language.script", "analyze_script_profile"),
    "MultilingualPolicyConfig": ("indw.filter.language.script_policy", "MultilingualPolicyConfig"),
    "MixtureOrchestrationConfig": ("indw.schedule.mix.config", "MixtureOrchestrationConfig"),
    "CorpusMixturePlan": ("indw.schedule.mix.plan", "CorpusMixturePlan"),
    "adapt_mixture_from_telemetry": ("indw.schedule.mix.telemetry", "adapt_mixture_from_telemetry"),
    "build_corpus_mixture_plan": ("indw.schedule.mix.mixture_planner", "build_corpus_mixture_plan"),
    "QualityPipelineConfig": ("indw.filter.spec.quality", "QualityPipelineConfig"),
    "merge_with_quality": ("indw.schedule.core", "merge_with_quality"),
    "QualityGate": ("indw.filter.gate.quality", "QualityGate"),
    "export_token_bins_fast": ("indw.store.export.fast_export", "export_token_bins_fast"),
    "build_pretrain_dataloader": ("indw.store.export.memmap_stream", "build_pretrain_dataloader"),
    "build_val_dataloader": ("indw.store.export.memmap_stream", "build_val_dataloader"),
}

# ``__version__`` followed by the lazy export names in sorted order.
__all__ = ["__version__", *sorted(_LAZY_EXPORTS)]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def __getattr__(name: str):
    """Resolve a lazily exported package attribute on first access.

    Names listed in ``_LAZY_EXPORTS`` are imported from their module, cached
    in the package globals and returned.

    Raises:
        AttributeError: if *name* is not a lazy export.
    """
    if name not in _LAZY_EXPORTS:
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')

    import importlib

    module_name, attr_name = _LAZY_EXPORTS[name]
    module = importlib.import_module(module_name)
    resolved = getattr(module, attr_name)
    # Cache in module globals so later lookups find the attribute directly.
    globals()[name] = resolved
    return resolved
|
indw/_compat.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import importlib.abc
|
|
5
|
+
import importlib.util
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
_installed = False
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class _IndwCompatFinder(importlib.abc.MetaPathFinder):
    """Meta-path finder that serves the ``data`` package name from ``indw``.

    ``data`` and ``data.<sub>`` resolve to the same module objects as
    ``indw`` and ``indw.<sub>``. Every other name is left to the remaining
    finders.
    """

    class _AliasLoader(importlib.abc.Loader):
        """Loader that hands back an already-imported module under an alias."""

        def __init__(self, aliased):
            self._aliased = aliased
            # ``importlib.util.module_from_spec`` overwrites ``__spec__`` on
            # whatever ``create_module`` returns; remember the real one.
            self._real_spec = getattr(aliased, "__spec__", None)

        def create_module(self, spec):
            """Return the existing module instead of creating a new one."""
            return self._aliased

        def exec_module(self, module):
            """Execute nothing; only restore the module's real ``__spec__``."""
            module.__spec__ = self._real_spec

    def find_spec(self, fullname, path, target=None):
        """Return an alias spec for ``data``/``data.*`` names, else ``None``.

        The matching ``indw`` module is imported (or fetched from
        ``sys.modules``) and wrapped in a spec whose loader returns that same
        object. The import system then registers it under *fullname* without
        executing it again. Returning a loader-less spec, or pre-populating
        ``sys.modules[fullname]``, would make the import machinery either
        fail with "missing loader" or re-run the real module's own spec.

        Returns ``None`` when the ``indw`` counterpart does not exist.
        ``ModuleNotFoundError`` raised for any other module (for example a
        missing dependency of the counterpart) is propagated.
        """
        if fullname != "data" and not fullname.startswith("data."):
            return None
        # "data" -> "indw", "data.x.y" -> "indw.x.y".
        indw_name = "indw" + fullname[4:]
        try:
            mod = importlib.import_module(indw_name)
        except ModuleNotFoundError as exc:
            missing = exc.name
            counterpart_missing = (
                missing is None
                or missing == indw_name
                or indw_name.startswith(missing + ".")
            )
            if counterpart_missing:
                return None
            raise
        return importlib.util.spec_from_loader(
            fullname,
            self._AliasLoader(mod),
            is_package=hasattr(mod, "__path__"),
        )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def install_compat() -> None:
    """Ensure an ``_IndwCompatFinder`` is present on ``sys.meta_path``.

    Repeated calls do nothing: the module-level ``_installed`` flag
    short-circuits, and a finder instance already on the meta path is
    reused instead of adding a second one. A new finder is inserted at
    index 0.
    """
    global _installed
    if _installed:
        return
    already_present = any(
        isinstance(finder, _IndwCompatFinder) for finder in sys.meta_path
    )
    if not already_present:
        sys.meta_path.insert(0, _IndwCompatFinder())
    _installed = True
|
indw/clean/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Public names of this package, in export order.
__all__ = [
    'CorpusCleaningPipeline',
    'CleaningResult',
    'extract_row_text',
    'final_pass_jsonl_row',
    'process_jsonl_row',
    'row_text_key',
]

# Each public name maps to (module path, attribute name); all of them live in
# ``indw.clean.corpus`` under the same attribute name.
_LAZY = {public_name: ('indw.clean.corpus', public_name) for public_name in __all__}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def __getattr__(name: str):
    """Import a name registered in ``_LAZY`` on first access and cache it.

    Raises:
        AttributeError: if ``name`` is not registered in ``_LAZY``.
    """
    if name not in _LAZY:
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')

    target_module, target_attr = _LAZY[name]

    from importlib import import_module

    resolved = getattr(import_module(target_module), target_attr)
    # Store the object in module globals under the requested name.
    globals()[name] = resolved
    return resolved
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from indw.clean.artifact.discovery_config import DiscoveryConfig
|
|
9
|
+
from indw.clean.artifact.discovery_corpus import CorpusStatsAccumulator
|
|
10
|
+
from indw.clean.artifact.discovery_registry import DynamicArtifactRegistry
|
|
11
|
+
|
|
12
|
+
@dataclass
class ShadowDisagreement:
    """One document's legacy ratio, discovery ratio and their delta."""

    doc_id: str = ''
    legacy_ratio: float = 0.0
    discovery_ratio: float = 0.0
    delta: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict with the numeric fields rounded to 4 decimals."""
        payload: dict[str, Any] = {'doc_id': self.doc_id}
        for metric in ('legacy_ratio', 'discovery_ratio', 'delta'):
            payload[metric] = round(getattr(self, metric), 4)
        return payload
|
|
26
|
+
|
|
27
|
+
@dataclass
class CalibrationReport:
    """Summary of one calibration batch."""

    batch_id: int = 0
    docs_seen: int = 0
    promoted: int = 0
    demoted: int = 0
    registry_size: int = 0
    shadow_disagreements: list[ShadowDisagreement] = field(default_factory=list)
    trim_threshold: float = 0.92

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict of the report.

        Only the last 500 shadow disagreements are included, each converted
        with its own ``to_dict``.
        """
        scalar_fields = (
            'batch_id',
            'docs_seen',
            'promoted',
            'demoted',
            'registry_size',
            'trim_threshold',
        )
        summary: dict[str, Any] = {key: getattr(self, key) for key in scalar_fields}
        recent = self.shadow_disagreements[-500:]
        summary['shadow_disagreements'] = [item.to_dict() for item in recent]
        return summary
|
|
47
|
+
|
|
48
|
+
def batch_calibrate(
    accumulator: CorpusStatsAccumulator,
    registry: DynamicArtifactRegistry,
    config: DiscoveryConfig,
    *,
    corpus_dir: str = '',
    shadow_disagreements: list[ShadowDisagreement] | None = None,
) -> CalibrationReport:
    """Close the current batch, recalibrate the registry and report the result.

    Args:
        accumulator: Corpus statistics; its batch is ended with
            ``config.decay`` before calibration.
        registry: Registry recalibrated against ``accumulator``.
        config: Supplies ``decay`` and ``min_trim_confidence``.
        corpus_dir: When non-empty, the report is appended to
            ``<corpus_dir>/discovery_calibration.json`` (newest 50 kept).
        shadow_disagreements: Optional disagreements to attach to the report.

    Returns:
        The ``CalibrationReport`` for this batch.
    """
    accumulator.end_batch(decay=config.decay)
    cal = registry.calibrate(accumulator)
    report = CalibrationReport(
        batch_id=accumulator.batch_id,
        docs_seen=accumulator.docs_seen,
        promoted=cal['promoted'],
        demoted=cal['demoted'],
        registry_size=cal['total'],
        shadow_disagreements=shadow_disagreements or [],
        trim_threshold=config.min_trim_confidence,
    )
    if corpus_dir:
        out = Path(corpus_dir) / 'discovery_calibration.json'
        out.parent.mkdir(parents=True, exist_ok=True)
        try:
            loaded = json.loads(out.read_text(encoding='utf-8'))
        except (ValueError, OSError):
            # Missing or unreadable file (OSError), or malformed JSON / invalid
            # UTF-8 (both ValueError subclasses): start a fresh history
            # instead of failing the calibration run.
            loaded = []
        existing: list[dict] = loaded if isinstance(loaded, list) else [loaded]
        existing.append(report.to_dict())
        # Write to a sibling temp file and rename it over the target so an
        # interrupted write cannot leave a truncated history behind.
        tmp = out.with_name(out.name + '.tmp')
        tmp.write_text(json.dumps(existing[-50:], indent=2), encoding='utf-8')
        tmp.replace(out)
    return report
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from indw.clean.artifact.discovery_corpus import CorpusStatsAccumulator
|
|
6
|
+
from indw.clean.artifact.decompose import DocumentUnit, position_bin
|
|
7
|
+
from indw.clean.artifact.discovery_registry import ArtifactEntry, DynamicArtifactRegistry
|
|
8
|
+
from indw.clean.artifact.safeguards import is_protected_unit
|
|
9
|
+
from indw.clean.document.value import analyze_content_value
|
|
10
|
+
|
|
11
|
+
@dataclass
class FusedConfidence:
    """Per-unit confidence signals plus the resulting trim decision."""

    artifact_confidence: float = 0.0
    knowledge_confidence: float = 0.0
    frequency_confidence: float = 0.0
    position_confidence: float = 0.0
    structural_confidence: float = 0.0
    novelty_confidence: float = 0.0
    repetition_confidence: float = 0.0
    coverage_confidence: float = 0.0
    entropy_confidence: float = 0.0
    trim_tier: str = 'keep'
    would_trim: bool = False

    def to_dict(self) -> dict[str, float | bool | str]:
        """Return a plain dict; every ``*_confidence`` value is rounded to 4 decimals."""
        score_names = (
            'artifact_confidence',
            'knowledge_confidence',
            'frequency_confidence',
            'position_confidence',
            'structural_confidence',
            'novelty_confidence',
            'repetition_confidence',
            'coverage_confidence',
            'entropy_confidence',
        )
        snapshot: dict[str, float | bool | str] = {
            score: round(getattr(self, score), 4) for score in score_names
        }
        snapshot['trim_tier'] = self.trim_tier
        snapshot['would_trim'] = self.would_trim
        return snapshot
|
|
39
|
+
|
|
40
|
+
class ConfidenceFusion:
    """Fuse registry evidence with per-document signals into a trim decision.

    Keyword-only thresholds:
        min_trim_confidence: Fused artifact score required for the ``'high'`` tier.
        medium_trim_confidence: Fused artifact score required for the
            ``'medium'`` tier.
        knowledge_dampen: Knowledge score above which the artifact score is
            scaled down, and at or above which the ``'high'`` tier is refused.
    """

    def __init__(
        self,
        *,
        min_trim_confidence: float = 0.92,
        medium_trim_confidence: float = 0.72,
        knowledge_dampen: float = 0.55,
    ) -> None:
        self.min_trim_confidence = min_trim_confidence
        self.medium_trim_confidence = medium_trim_confidence
        self.knowledge_dampen = knowledge_dampen

    def fuse_unit(
        self,
        unit: DocumentUnit,
        entry: ArtifactEntry | None,
        *,
        doc_text: str,
        count_in_doc: int = 1,
        doc_len: int = 0,
    ) -> FusedConfidence:
        """Score one unit and decide whether it would be trimmed.

        Args:
            unit: The document unit under consideration.
            entry: Registry entry for the unit, or ``None`` when there is none.
            doc_text: Full document text; a window of up to 200 characters on
                either side of the unit is analysed for content value.
            count_in_doc: Number of occurrences of the unit in this document.
            doc_len: Document length used for position binning (clamped to >= 1).

        Returns:
            A ``FusedConfidence``. Without an entry the defaults are returned
            (tier ``'keep'``); protected units get tier ``'protected'`` with
            knowledge confidence 1.0; otherwise the tier is ``'high'``,
            ``'medium'`` or ``'keep'``, and ``would_trim`` is set for the
            first two.
        """
        if entry is None:
            return FusedConfidence()

        if is_protected_unit(unit.text, kind=unit.kind, in_fence=unit.in_fence):
            return FusedConfidence(knowledge_confidence=1.0, trim_tier='protected')

        # Judge content value on the unit plus surrounding context.
        ctx_start = max(0, unit.start - 200)
        ctx_end = min(len(doc_text), unit.end + 200)
        cv = analyze_content_value(doc_text[ctx_start:ctx_end])
        knowledge = cv.overall_value_score
        if cv.evidence and cv.evidence.preserve:
            # A preserve verdict puts a floor of 0.85 under the knowledge score.
            knowledge = max(knowledge, 0.85)

        # 0.0 for a single occurrence, approaching 1.0 as the count grows.
        repetition = 1.0 - min(1.0, 1.0 / max(count_in_doc, 1))
        coverage = entry.frequency_confidence
        entropy_signal = 1.0 - entry.novelty_confidence

        # Weighted blend of registry confidence, repetition and coverage.
        artifact = min(
            1.0,
            entry.artifact_confidence * 0.75 + repetition * 0.15 + coverage * 0.10,
        )
        if knowledge > self.knowledge_dampen:
            # Scale the artifact score down by the knowledge excess.
            artifact *= max(0.0, 1.0 - (knowledge - self.knowledge_dampen))

        tier = 'keep'
        would_trim = False
        if (
            artifact >= self.min_trim_confidence
            and knowledge < self.knowledge_dampen
            and entry.novelty_confidence < 0.35
        ):
            # Fenced or code units are never trimmed; they also do not fall
            # through to the medium tier.
            if not unit.in_fence and unit.kind != 'code':
                tier = 'high'
                would_trim = True
        elif (
            artifact >= self.medium_trim_confidence
            and knowledge < 0.45
            and entry.novelty_confidence < 0.25
        ):
            # Bins 0 and 4 are presumably the first and last position bins;
            # confirm against position_bin.
            bin_idx = position_bin(unit.start, max(doc_len, 1))
            # Mirror the high tier: a fenced unit must not be trimmed at the
            # weaker medium confidence when it is refused at high confidence.
            if (
                not unit.in_fence
                and bin_idx in (0, 4)
                and unit.kind in ('line', 'header', 'footer')
            ):
                tier = 'medium'
                would_trim = True

        return FusedConfidence(
            artifact_confidence=artifact,
            knowledge_confidence=knowledge,
            frequency_confidence=entry.frequency_confidence,
            position_confidence=entry.position_confidence,
            structural_confidence=entry.structural_confidence,
            novelty_confidence=entry.novelty_confidence,
            repetition_confidence=repetition,
            coverage_confidence=coverage,
            entropy_confidence=entropy_signal,
            trim_tier=tier,
            would_trim=would_trim,
        )

    def fuse_document(
        self,
        units: list[DocumentUnit],
        registry: DynamicArtifactRegistry,
        accumulator: CorpusStatsAccumulator,
        doc_text: str,
        *,
        key_counts: dict[str, int] | None = None,
    ) -> list[tuple[DocumentUnit, FusedConfidence]]:
        """Fuse every unit of a document, preserving the input order.

        Args:
            units: Units of the document.
            registry: Registry consulted for each unit's entry.
            accumulator: Corpus statistics passed through to the lookup.
            doc_text: Full document text.
            key_counts: Optional per-fragment-key occurrence counts; a unit
                whose key is absent is treated as occurring once.

        Returns:
            One ``(unit, fused)`` pair per input unit.
        """
        # Resolved once per call instead of once per unit.
        from indw.clean.artifact.discovery_corpus import fragment_key

        counts: dict[str, int] = key_counts or {}
        doc_len = len(doc_text)
        out: list[tuple[DocumentUnit, FusedConfidence]] = []
        for unit in units:
            cnt = counts.get(fragment_key(unit.text, unit.layout), 1)
            entry = registry.lookup(
                unit.text, accumulator, layout=unit.layout, count_in_doc=cnt,
            )
            fused = self.fuse_unit(
                unit, entry, doc_text=doc_text, count_in_doc=cnt, doc_len=doc_len,
            )
            out.append((unit, fused))
        return out
|