indw 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (426) hide show
  1. app/cli.py +34 -0
  2. app/commands/__init__.py +16 -0
  3. app/commands/audit.py +22 -0
  4. app/commands/benchmark.py +15 -0
  5. app/commands/doctor.py +22 -0
  6. app/commands/merge.py +43 -0
  7. app/commands/test.py +20 -0
  8. app/commands/validate.py +15 -0
  9. app/workflows.py +70 -0
  10. indw/__init__.py +42 -0
  11. indw/_compat.py +35 -0
  12. indw/clean/__init__.py +29 -0
  13. indw/clean/artifact/calibrate.py +80 -0
  14. indw/clean/artifact/confidence.py +132 -0
  15. indw/clean/artifact/decompose.py +265 -0
  16. indw/clean/artifact/discovery_clean.py +57 -0
  17. indw/clean/artifact/discovery_config.py +74 -0
  18. indw/clean/artifact/discovery_corpus.py +364 -0
  19. indw/clean/artifact/discovery_engine.py +229 -0
  20. indw/clean/artifact/discovery_registry.py +191 -0
  21. indw/clean/artifact/discovery_structural.py +48 -0
  22. indw/clean/artifact/discovery_validation.py +54 -0
  23. indw/clean/artifact/engine.py +161 -0
  24. indw/clean/artifact/evidence.py +73 -0
  25. indw/clean/artifact/evidence_cache.py +348 -0
  26. indw/clean/artifact/evidence_engine.py +328 -0
  27. indw/clean/artifact/evidence_features.py +197 -0
  28. indw/clean/artifact/evidence_model.py +536 -0
  29. indw/clean/artifact/evidence_util.py +71 -0
  30. indw/clean/artifact/novelty.py +34 -0
  31. indw/clean/artifact/positional.py +29 -0
  32. indw/clean/artifact/registry.py +139 -0
  33. indw/clean/artifact/safeguards.py +42 -0
  34. indw/clean/artifact/strip.py +144 -0
  35. indw/clean/artifact/trim.py +76 -0
  36. indw/clean/corpus.py +712 -0
  37. indw/clean/document/adaptive.py +161 -0
  38. indw/clean/document/boilerplate.py +71 -0
  39. indw/clean/document/clean.py +56 -0
  40. indw/clean/document/code_preservation.py +287 -0
  41. indw/clean/document/compression.py +109 -0
  42. indw/clean/document/config.py +148 -0
  43. indw/clean/document/conversation.py +219 -0
  44. indw/clean/document/dedup.py +33 -0
  45. indw/clean/document/html.py +189 -0
  46. indw/clean/document/license.py +702 -0
  47. indw/clean/document/metrics.py +293 -0
  48. indw/clean/document/normalize.py +136 -0
  49. indw/clean/document/patterns.py +96 -0
  50. indw/clean/document/segment.py +217 -0
  51. indw/clean/document/stage_manifest.py +94 -0
  52. indw/clean/document/stats.py +134 -0
  53. indw/clean/document/ui.py +103 -0
  54. indw/clean/document/validate.py +105 -0
  55. indw/clean/document/value.py +557 -0
  56. indw/clean/gate/evaluate.py +502 -0
  57. indw/clean/gate/policy.py +88 -0
  58. indw/clean/meta/clean.py +225 -0
  59. indw/clean/meta/foundation.py +505 -0
  60. indw/clean/meta/patterns.py +358 -0
  61. indw/clean/meta/stats.py +72 -0
  62. indw/clean/meta/strip.py +319 -0
  63. indw/clean/semantic/boilerplate.py +102 -0
  64. indw/clean/semantic/classifier.py +174 -0
  65. indw/clean/semantic/clean.py +145 -0
  66. indw/clean/semantic/config.py +114 -0
  67. indw/clean/semantic/embedded.py +396 -0
  68. indw/clean/semantic/fingerprints.py +89 -0
  69. indw/clean/semantic/ocr_normalize.py +91 -0
  70. indw/clean/semantic/pipeline.py +241 -0
  71. indw/clean/semantic/report.py +126 -0
  72. indw/clean/semantic/routing.py +146 -0
  73. indw/clean/semantic/scoring.py +86 -0
  74. indw/clean/semantic/section_artifacts.py +345 -0
  75. indw/clean/semantic/spec.py +8 -0
  76. indw/clean/semantic/structure.py +241 -0
  77. indw/clean/semantic/thresholds.py +52 -0
  78. indw/clean/structure/__init__.py +35 -0
  79. indw/clean/structure/extract.py +121 -0
  80. indw/clean/structure/labeled_qa.py +104 -0
  81. indw/clean/structure/reference_sections.py +87 -0
  82. indw/config/__init__.py +44 -0
  83. indw/config/defaults.py +147 -0
  84. indw/config/loader.py +215 -0
  85. indw/config/resolve.py +171 -0
  86. indw/config/validation.py +103 -0
  87. indw/dedup/__init__.py +21 -0
  88. indw/dedup/backends/__init__.py +21 -0
  89. indw/dedup/backends/fuzzy.py +7 -0
  90. indw/dedup/embed/__init__.py +10 -0
  91. indw/dedup/embed/ann.py +61 -0
  92. indw/dedup/embed/candidate.py +84 -0
  93. indw/dedup/embed/cluster.py +45 -0
  94. indw/dedup/embed/config.py +59 -0
  95. indw/dedup/embed/contracts.py +104 -0
  96. indw/dedup/embed/e5.py +324 -0
  97. indw/dedup/embed/pipeline.py +212 -0
  98. indw/dedup/embed/pools/__init__.py +0 -0
  99. indw/dedup/embed/pools/gpu_worker.py +34 -0
  100. indw/dedup/embed/providers.py +93 -0
  101. indw/dedup/embed/representative.py +20 -0
  102. indw/dedup/embed/similarity.py +36 -0
  103. indw/dedup/embed/threshold.py +47 -0
  104. indw/dedup/exact.py +173 -0
  105. indw/dedup/fuzzy.py +110 -0
  106. indw/dedup/normalize.py +58 -0
  107. indw/dedup/replay.py +68 -0
  108. indw/dedup/semantic.py +140 -0
  109. indw/dedup/service/__init__.py +3 -0
  110. indw/dedup/service/exact_shard.py +106 -0
  111. indw/dedup/storage.py +60 -0
  112. indw/extract/__init__.py +20 -0
  113. indw/extract/assess/doc_type.py +64 -0
  114. indw/extract/assess/engine.py +226 -0
  115. indw/extract/assess/feedback.py +79 -0
  116. indw/extract/assess/metrics.py +116 -0
  117. indw/extract/assess/quality.py +70 -0
  118. indw/extract/core/clean.py +389 -0
  119. indw/extract/core/context.py +312 -0
  120. indw/extract/core/profile.py +223 -0
  121. indw/extract/core/units.py +1020 -0
  122. indw/extract/nav/context.py +442 -0
  123. indw/extract/nav/metrics.py +138 -0
  124. indw/extract/nav/template.py +83 -0
  125. indw/extract/roles/education.py +551 -0
  126. indw/extract/roles/forum.py +733 -0
  127. indw/extract/roles/publication.py +1053 -0
  128. indw/extract/roles/units.py +246 -0
  129. indw/extract/sections/boundaries.py +535 -0
  130. indw/extract/sections/classify.py +594 -0
  131. indw/extract/sections/integrity.py +462 -0
  132. indw/extract/sections/quality.py +440 -0
  133. indw/extract/sections/scratch.py +37 -0
  134. indw/extract/sections/semantic.py +522 -0
  135. indw/extract/structure/aggregate.py +773 -0
  136. indw/extract/structure/analyze.py +150 -0
  137. indw/extract/structure/document.py +67 -0
  138. indw/extract/structure/inline.py +457 -0
  139. indw/extract/structure/recovery.py +68 -0
  140. indw/extract/structure/segment.py +129 -0
  141. indw/filter/__init__.py +23 -0
  142. indw/filter/content/code.py +681 -0
  143. indw/filter/content/domain.py +65 -0
  144. indw/filter/content/filters.py +504 -0
  145. indw/filter/content/metadata.py +143 -0
  146. indw/filter/decide/calibrate.py +207 -0
  147. indw/filter/decide/curator.py +92 -0
  148. indw/filter/decide/engine.py +302 -0
  149. indw/filter/decide/policy.py +374 -0
  150. indw/filter/decide/threshold.py +160 -0
  151. indw/filter/gate/diagnostics.py +90 -0
  152. indw/filter/gate/quality.py +528 -0
  153. indw/filter/gate/reports.py +153 -0
  154. indw/filter/gate/scorer.py +77 -0
  155. indw/filter/language/bridge.py +54 -0
  156. indw/filter/language/confidence.py +42 -0
  157. indw/filter/language/config.py +132 -0
  158. indw/filter/language/detect.py +168 -0
  159. indw/filter/language/fast_detector.py +86 -0
  160. indw/filter/language/mixed.py +85 -0
  161. indw/filter/language/reports.py +127 -0
  162. indw/filter/language/script.py +339 -0
  163. indw/filter/language/script_metrics.py +157 -0
  164. indw/filter/language/script_opt.py +141 -0
  165. indw/filter/language/script_orch.py +84 -0
  166. indw/filter/language/script_policy.py +40 -0
  167. indw/filter/language/script_table.py +127 -0
  168. indw/filter/language/stats.py +39 -0
  169. indw/filter/language/telemetry.py +161 -0
  170. indw/filter/language/token_metrics.py +121 -0
  171. indw/filter/language/validation.py +218 -0
  172. indw/filter/license/classifier.py +68 -0
  173. indw/filter/license/config.py +89 -0
  174. indw/filter/license/detector.py +188 -0
  175. indw/filter/license/manifest.py +70 -0
  176. indw/filter/license/normalize.py +140 -0
  177. indw/filter/license/policy.py +244 -0
  178. indw/filter/license/record.py +74 -0
  179. indw/filter/license/reports.py +104 -0
  180. indw/filter/license/schema.py +101 -0
  181. indw/filter/license/source_policy.py +157 -0
  182. indw/filter/pii/config.py +130 -0
  183. indw/filter/pii/context.py +72 -0
  184. indw/filter/pii/detect.py +76 -0
  185. indw/filter/pii/entities.py +182 -0
  186. indw/filter/pii/redaction.py +25 -0
  187. indw/filter/pii/reports.py +96 -0
  188. indw/filter/pii/risk.py +91 -0
  189. indw/filter/pii/secrets.py +206 -0
  190. indw/filter/pii/validation.py +180 -0
  191. indw/filter/refine/audit.py +124 -0
  192. indw/filter/refine/corpus.py +226 -0
  193. indw/filter/refine/processor.py +102 -0
  194. indw/filter/refine/rewrite.py +146 -0
  195. indw/filter/refine/settings.py +74 -0
  196. indw/filter/refine/truncation.py +189 -0
  197. indw/filter/score/adaptive.py +65 -0
  198. indw/filter/score/analysis.py +274 -0
  199. indw/filter/score/artifacts.py +174 -0
  200. indw/filter/score/builder.py +167 -0
  201. indw/filter/score/canonical.py +112 -0
  202. indw/filter/score/continuous.py +199 -0
  203. indw/filter/score/engine.py +41 -0
  204. indw/filter/score/signals.py +179 -0
  205. indw/filter/score/types.py +108 -0
  206. indw/filter/spec/document.py +163 -0
  207. indw/filter/spec/export.py +44 -0
  208. indw/filter/spec/pipeline.py +441 -0
  209. indw/filter/spec/quality.py +412 -0
  210. indw/filter/spec/validate.py +39 -0
  211. indw/filter/stage0/admission.py +68 -0
  212. indw/filter/stage0/audit.py +528 -0
  213. indw/filter/stage0/engine.py +289 -0
  214. indw/filter/stage0/verify.py +316 -0
  215. indw/filter/toxicity/classifier_labels.py +10 -0
  216. indw/filter/toxicity/config.py +138 -0
  217. indw/filter/toxicity/context.py +100 -0
  218. indw/filter/toxicity/detect.py +90 -0
  219. indw/filter/toxicity/patterns.py +62 -0
  220. indw/filter/toxicity/reports.py +96 -0
  221. indw/filter/toxicity/rule_scorer.py +89 -0
  222. indw/filter/toxicity/rules.py +83 -0
  223. indw/filter/toxicity/scorer.py +126 -0
  224. indw/filter/toxicity/validation.py +146 -0
  225. indw/ingest/__init__.py +18 -0
  226. indw/ingest/download.py +199 -0
  227. indw/ingest/format.py +165 -0
  228. indw/ingest/hash.py +65 -0
  229. indw/ingest/hf_datasets.py +79 -0
  230. indw/ingest/hf_env.py +12 -0
  231. indw/ingest/log.py +57 -0
  232. indw/ingest/resume.py +39 -0
  233. indw/ingest/run.py +166 -0
  234. indw/ingest/sink.py +132 -0
  235. indw/ingest/transcript.py +56 -0
  236. indw/schedule/__init__.py +18 -0
  237. indw/schedule/admission/__init__.py +21 -0
  238. indw/schedule/admission/tier01.py +90 -0
  239. indw/schedule/admission/tiers.py +35 -0
  240. indw/schedule/admission/tracker.py +61 -0
  241. indw/schedule/apply/coordinator.py +165 -0
  242. indw/schedule/apply/dedup.py +121 -0
  243. indw/schedule/apply/lifecycle.py +145 -0
  244. indw/schedule/apply/merge.py +284 -0
  245. indw/schedule/apply/serialize.py +79 -0
  246. indw/schedule/architecture/__init__.py +26 -0
  247. indw/schedule/architecture/classify.py +76 -0
  248. indw/schedule/architecture/graph.py +67 -0
  249. indw/schedule/architecture/ownership.py +131 -0
  250. indw/schedule/architecture/resources.py +44 -0
  251. indw/schedule/backends/__init__.py +13 -0
  252. indw/schedule/backends/config.py +40 -0
  253. indw/schedule/backends/contract.py +30 -0
  254. indw/schedule/backends/dask.py +131 -0
  255. indw/schedule/backends/factory.py +27 -0
  256. indw/schedule/backends/local.py +76 -0
  257. indw/schedule/backends/multiprocess.py +120 -0
  258. indw/schedule/backends/thread.py +121 -0
  259. indw/schedule/config/hardware.py +140 -0
  260. indw/schedule/config/pin.py +47 -0
  261. indw/schedule/config/policy.py +356 -0
  262. indw/schedule/config/resolve.py +80 -0
  263. indw/schedule/config/tune.py +258 -0
  264. indw/schedule/core.py +128 -0
  265. indw/schedule/dispatch/alloc.py +289 -0
  266. indw/schedule/dispatch/lanes.py +178 -0
  267. indw/schedule/dispatch/parallel.py +802 -0
  268. indw/schedule/dispatch/workers.py +252 -0
  269. indw/schedule/graph/__init__.py +3 -0
  270. indw/schedule/graph/artifacts.py +33 -0
  271. indw/schedule/graph/config.py +30 -0
  272. indw/schedule/graph/envelope.py +5 -0
  273. indw/schedule/graph/queues.py +161 -0
  274. indw/schedule/graph/runner.py +297 -0
  275. indw/schedule/ingest/coordinator.py +20 -0
  276. indw/schedule/intel/coordination.py +106 -0
  277. indw/schedule/intel/fingerprints.py +171 -0
  278. indw/schedule/intel/genome.py +229 -0
  279. indw/schedule/intel/hardware.py +109 -0
  280. indw/schedule/intel/incremental.py +192 -0
  281. indw/schedule/intel/inheritance.py +55 -0
  282. indw/schedule/intel/lci_graph.py +599 -0
  283. indw/schedule/intel/lci_router.py +112 -0
  284. indw/schedule/intel/merge_session.py +72 -0
  285. indw/schedule/intel/pci.py +307 -0
  286. indw/schedule/intel/pools/__init__.py +0 -0
  287. indw/schedule/intel/pools/acim.py +43 -0
  288. indw/schedule/intel/pools/pci.py +41 -0
  289. indw/schedule/intel/promotion.py +138 -0
  290. indw/schedule/intel/router.py +117 -0
  291. indw/schedule/intel/scores.py +86 -0
  292. indw/schedule/intel/session.py +417 -0
  293. indw/schedule/intel/store.py +289 -0
  294. indw/schedule/mix/config.py +80 -0
  295. indw/schedule/mix/curriculum.py +97 -0
  296. indw/schedule/mix/mixture_planner.py +130 -0
  297. indw/schedule/mix/plan.py +83 -0
  298. indw/schedule/mix/sampler.py +181 -0
  299. indw/schedule/mix/telemetry.py +116 -0
  300. indw/schedule/monitor/audit.py +23 -0
  301. indw/schedule/monitor/budget.py +33 -0
  302. indw/schedule/monitor/cost.py +183 -0
  303. indw/schedule/monitor/cpu.py +17 -0
  304. indw/schedule/monitor/doc.py +264 -0
  305. indw/schedule/monitor/invariants.py +105 -0
  306. indw/schedule/monitor/live.py +211 -0
  307. indw/schedule/monitor/obs.py +123 -0
  308. indw/schedule/monitor/pipeline_exporter.py +744 -0
  309. indw/schedule/read/gates.py +163 -0
  310. indw/schedule/read/ingest.py +38 -0
  311. indw/schedule/read/preprocess.py +53 -0
  312. indw/schedule/read/probe.py +230 -0
  313. indw/schedule/read/sources.py +57 -0
  314. indw/schedule/routing/__init__.py +0 -0
  315. indw/schedule/routing/admission.py +24 -0
  316. indw/schedule/row/index.py +127 -0
  317. indw/schedule/row/provenance.py +37 -0
  318. indw/schedule/row/reject.py +58 -0
  319. indw/schedule/row/resolve.py +40 -0
  320. indw/schedule/row/signals.py +55 -0
  321. indw/schedule/stages/__init__.py +21 -0
  322. indw/schedule/stages/artifact_cleaning.py +11 -0
  323. indw/schedule/stages/classification.py +96 -0
  324. indw/schedule/stages/contracts.py +71 -0
  325. indw/schedule/stages/curator.py +20 -0
  326. indw/schedule/stages/engine.py +325 -0
  327. indw/schedule/stages/knowledge.py +16 -0
  328. indw/schedule/stages/normalization.py +11 -0
  329. indw/schedule/stages/pools/__init__.py +20 -0
  330. indw/schedule/stages/pools/chain.py +111 -0
  331. indw/schedule/stages/pools/clean.py +57 -0
  332. indw/schedule/stages/pools/filter.py +58 -0
  333. indw/schedule/stages/pools/preprocess.py +67 -0
  334. indw/schedule/stages/pools/stage0.py +93 -0
  335. indw/schedule/stages/quality.py +20 -0
  336. indw/schedule/stages/rewrite.py +17 -0
  337. indw/schedule/stages/runner.py +121 -0
  338. indw/schedule/stages/structural_repair.py +61 -0
  339. indw/schedule/stages/validation.py +15 -0
  340. indw/schedule/state/artifacts.py +358 -0
  341. indw/schedule/state/checkpoint.py +770 -0
  342. indw/schedule/state/context.py +130 -0
  343. indw/schedule/state/lock.py +135 -0
  344. indw/schedule/state/sessions.py +81 -0
  345. indw/schedule/state/setup.py +235 -0
  346. indw/schedule/state/survivor.py +87 -0
  347. indw/store/corpus/manifest.py +80 -0
  348. indw/store/corpus/registry.py +134 -0
  349. indw/store/eval/compare.py +121 -0
  350. indw/store/eval/config.py +87 -0
  351. indw/store/eval/decision.py +93 -0
  352. indw/store/eval/diversity.py +51 -0
  353. indw/store/eval/evaluator.py +153 -0
  354. indw/store/eval/knowledge.py +18 -0
  355. indw/store/eval/metrics.py +79 -0
  356. indw/store/eval/reports.py +102 -0
  357. indw/store/eval/scoring.py +53 -0
  358. indw/store/eval/validation.py +214 -0
  359. indw/store/export/config.py +73 -0
  360. indw/store/export/export_items.py +9 -0
  361. indw/store/export/fast_export.py +173 -0
  362. indw/store/export/memmap_stream.py +159 -0
  363. indw/store/export/packed_stream.py +323 -0
  364. indw/store/export/packing/__init__.py +12 -0
  365. indw/store/export/packing/binpack.py +234 -0
  366. indw/store/export/packing/collate.py +53 -0
  367. indw/store/export/packing/config.py +35 -0
  368. indw/store/export/pipeline.py +117 -0
  369. indw/store/export/prefetch.py +101 -0
  370. indw/store/export/replay_export.py +49 -0
  371. indw/store/export/shard_io.py +113 -0
  372. indw/store/export/shard_meta.py +129 -0
  373. indw/store/export/splits.py +37 -0
  374. indw/store/io/atomic.py +101 -0
  375. indw/store/io/cache.py +101 -0
  376. indw/store/io/columnar.py +73 -0
  377. indw/store/io/json_codec.py +33 -0
  378. indw/store/io/jsonl.py +93 -0
  379. indw/store/io/retry.py +106 -0
  380. indw/tools/__init__.py +0 -0
  381. indw/tools/metrics/alerts.py +57 -0
  382. indw/tools/metrics/config.py +70 -0
  383. indw/tools/metrics/pipeline_health.py +198 -0
  384. indw/tools/metrics/recovery.py +65 -0
  385. indw/tools/metrics/regression.py +191 -0
  386. indw/tools/metrics/reject_log.py +74 -0
  387. indw/tools/metrics/reports.py +114 -0
  388. indw/tools/metrics/snapshot.py +153 -0
  389. indw/tools/metrics/stage_profile.py +134 -0
  390. indw/tools/metrics/storage.py +71 -0
  391. indw/tools/metrics/trends.py +86 -0
  392. indw/tools/metrics/validation.py +199 -0
  393. indw/tools/reports/__init__.py +0 -0
  394. indw/tools/reports/admission_cost.py +187 -0
  395. indw/tools/reports/audit_left/artifact_leakage.py +105 -0
  396. indw/tools/reports/audit_left/pipeline.py +344 -0
  397. indw/tools/reports/audit_left/validation.py +526 -0
  398. indw/tools/reports/batch_efficiency_audit.py +174 -0
  399. indw/tools/reports/benchmark/__init__.py +21 -0
  400. indw/tools/reports/benchmark/scale.py +462 -0
  401. indw/tools/reports/dask_integration.py +149 -0
  402. indw/tools/reports/execution_consolidation.py +210 -0
  403. indw/tools/reports/fast/__init__.py +0 -0
  404. indw/tools/reports/fast/analyze.py +521 -0
  405. indw/tools/reports/fast/patterns.py +22 -0
  406. indw/tools/reports/fast/report.py +95 -0
  407. indw/tools/reports/fast/sample.py +81 -0
  408. indw/tools/reports/fast/stats.py +110 -0
  409. indw/tools/reports/foundation_cost.py +154 -0
  410. indw/tools/reports/heavy_cost.py +169 -0
  411. indw/tools/reports/library_migration.py +216 -0
  412. indw/tools/reports/pipeline_audit.py +216 -0
  413. indw/tools/reports/pipeline_tune_report.py +144 -0
  414. indw/tools/reports/production_scale_report.py +354 -0
  415. indw/tools/reports/stabilization_audit.py +362 -0
  416. indw/tools/reports/stage0_cost.py +164 -0
  417. indw/util/hf_tokenizers.py +21 -0
  418. indw/util/stable_hash.py +41 -0
  419. indw/util/stats.py +15 -0
  420. indw-1.0.dist-info/METADATA +279 -0
  421. indw-1.0.dist-info/RECORD +426 -0
  422. indw-1.0.dist-info/WHEEL +5 -0
  423. indw-1.0.dist-info/entry_points.txt +2 -0
  424. indw-1.0.dist-info/licenses/LICENSE +190 -0
  425. indw-1.0.dist-info/licenses/NOTICE +17 -0
  426. indw-1.0.dist-info/top_level.txt +2 -0
app/cli.py ADDED
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+
6
+ from app.commands import (
7
+ register_audit,
8
+ register_benchmark,
9
+ register_doctor,
10
+ register_merge,
11
+ register_test,
12
+ register_validate,
13
+ )
14
+
15
+
16
def build_parser() -> argparse.ArgumentParser:
    """Create the top-level ``indw`` parser and attach every subcommand.

    Returns:
        A parser whose subcommand name is stored under ``command`` and is
        mandatory on the command line.
    """
    parser = argparse.ArgumentParser(prog="indw", description="INDW — Instant Data Workflow")
    subcommands = parser.add_subparsers(dest="command", required=True)
    # Registration order is kept as-is: argparse lists subcommands in the
    # order they were added.
    registrars = (
        register_merge,
        register_test,
        register_validate,
        register_audit,
        register_benchmark,
        register_doctor,
    )
    for attach in registrars:
        attach(subcommands)
    return parser
26
+
27
+
28
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and dispatch to the handler of the chosen subcommand.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` is passed
            through unchanged.

    Returns:
        The handler's result converted with ``int``.
    """
    parser = build_parser()
    namespace = parser.parse_args(argv)
    # The handler is read from the ``_handler`` attribute of the parsed namespace.
    handler = namespace._handler
    return int(handler(namespace))
31
+
32
+
33
+ if __name__ == "__main__":
34
+ raise SystemExit(main())
@@ -0,0 +1,16 @@
1
+ from app.commands.audit import register as register_audit
2
+ from app.commands.benchmark import register as register_benchmark
3
+ from app.commands.doctor import register as register_doctor
4
+ from app.commands.merge import register as register_merge
5
+ from app.commands.test import register as register_test
6
+
7
+ from app.commands.validate import register as register_validate
8
+
9
+ __all__ = [
10
+ "register_audit",
11
+ "register_benchmark",
12
+ "register_doctor",
13
+ "register_merge",
14
+ "register_test",
15
+ "register_validate",
16
+ ]
app/commands/audit.py ADDED
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+
7
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``audit`` subcommand and its options to *sub*."""
    audit_parser = sub.add_parser("audit", help="run pipeline audit reports")
    audit_kinds = ("pipeline", "dask", "production", "library", "stage0")
    audit_parser.add_argument("--kind", choices=audit_kinds, default="pipeline")
    audit_parser.add_argument("--work-dir", type=Path, default=None)
    audit_parser.add_argument("--workers", type=int, default=4)
    # Store the dispatch target on the namespace as ``_handler``.
    audit_parser.set_defaults(_handler=run)
17
+
18
+
19
def run(args: argparse.Namespace) -> int:
    """Run the audit described by *args* and return its exit status.

    Reads ``kind``, ``work_dir`` and ``workers`` from the namespace and
    forwards them to ``run_audit``.
    """
    # Function-local import: app.workflows is loaded only when this runs.
    from app.workflows import run_audit

    options = {
        "kind": args.kind,
        "work_dir": args.work_dir,
        "workers": args.workers,
    }
    return run_audit(**options)
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+
5
+
6
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``benchmark`` subcommand to *sub*.

    ``--workers`` is kept as one string (default ``"1 2 4"``); no ``type``
    conversion is applied here.
    """
    bench_parser = sub.add_parser("benchmark", help="production scale benchmark")
    worker_option = {"default": "1 2 4", "help": "worker counts"}
    bench_parser.add_argument("--workers", **worker_option)
    # Store the dispatch target on the namespace as ``_handler``.
    bench_parser.set_defaults(_handler=run)
10
+
11
+
12
def run(args: argparse.Namespace) -> int:
    """Run the benchmark with the worker string from *args*; return its status."""
    # Function-local import: app.workflows is loaded only when this runs.
    from app.workflows import run_benchmark

    worker_counts = args.workers
    return run_benchmark(workers=worker_counts)
app/commands/doctor.py ADDED
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import importlib.util
5
+ import platform
6
+
7
+
8
def register(sub: argparse._SubParsersAction) -> None:
    """Add the option-less ``doctor`` subcommand to *sub*."""
    doctor_parser = sub.add_parser(
        "doctor",
        help="check install and backend availability",
    )
    # Store the dispatch target on the namespace as ``_handler``.
    doctor_parser.set_defaults(_handler=run)
11
+
12
+
13
def run(_args: argparse.Namespace) -> int:
    """Print install diagnostics to stdout and return 0.

    Output, in order: the indw / Python / platform versions, the configured
    and resolved execution backend, then one ``name=ok|missing`` line for
    each probed package. The namespace argument is not used.
    """
    # Function-local imports: indw is only imported when the command runs.
    import indw
    from indw.schedule.backends.config import pipeline_execution_backend
    from indw.schedule.backends.factory import resolve_execution_backend

    # Each line is printed before the next one is computed, so earlier
    # lines still appear if a later lookup raises.
    print(f"indw={indw.__version__} python={platform.python_version()} platform={platform.platform()}")
    print(f"backend={pipeline_execution_backend()} resolved={resolve_execution_backend().name}")

    probed_packages = ("orjson", "trafilatura", "dask")
    for pkg in probed_packages:
        # find_spec only locates the package; it does not import it.
        print(f"{pkg}={'ok' if importlib.util.find_spec(pkg) else 'missing'}")
    return 0
app/commands/merge.py ADDED
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import os
5
+ from pathlib import Path
6
+
7
+
8
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``merge`` subcommand, its positionals and options to *sub*."""
    merge_parser = sub.add_parser("merge", help="run quality merge on raw corpus")

    # Two positional Path arguments, in this order.
    for positional in ("raw_dir", "out_path"):
        merge_parser.add_argument(positional, type=Path)

    merge_parser.add_argument("--work-dir", type=Path, default=None)

    # Integer options with their defaults.
    for flag, fallback in (("--workers", 1), ("--chunk-size", 500)):
        merge_parser.add_argument(flag, type=int, default=fallback)

    merge_parser.add_argument("--fresh", action="store_true")

    backend_names = ("local", "thread", "multiprocess", "dask")
    merge_parser.add_argument(
        "--backend",
        choices=backend_names,
        default=None,
        help="execution backend (INSTANT_PIPELINE_BACKEND)",
    )
    # Store the dispatch target on the namespace as ``_handler``.
    merge_parser.set_defaults(_handler=run)
23
+
24
+
25
def run(args: argparse.Namespace) -> int:
    """Run the quality merge described by *args* and return 0.

    Side effects on ``os.environ``: ``INSTANT_PIPELINE_BACKEND`` is set when
    ``--backend`` was given, and ``INSTANT_MERGE_HW_PROBE`` is set to ``"0"``
    unless it is already present.
    """
    # Function-local imports: the pipeline modules load only when this runs.
    from indw.filter.spec.quality import QualityPipelineConfig
    from indw.schedule.core import merge_with_quality

    chosen_backend = args.backend
    if chosen_backend:
        os.environ["INSTANT_PIPELINE_BACKEND"] = chosen_backend
    # setdefault keeps a value the caller already exported.
    os.environ.setdefault("INSTANT_MERGE_HW_PROBE", "0")

    quality = QualityPipelineConfig()
    merge_options = {
        "quality_config": quality,
        "work_dir": args.work_dir,
        "fresh": args.fresh,
        # Resume is simply the inverse of --fresh.
        "resume": not args.fresh,
        "workers": args.workers,
        "chunk_size": args.chunk_size,
    }
    merge_with_quality(args.raw_dir, args.out_path, **merge_options)
    return 0
app/commands/test.py ADDED
@@ -0,0 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+
5
+
6
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``test`` subcommand to *sub*.

    ``--profile`` is limited to a fixed set of names (default ``unit``); any
    remaining positionals are collected into ``pytest_args``.
    """
    test_parser = sub.add_parser("test", help="run framework test suite")
    profile_names = ("unit", "critical", "parity", "integration", "smoke")
    test_parser.add_argument("--profile", choices=profile_names, default="unit")
    test_parser.add_argument("pytest_args", nargs="*", help="extra pytest arguments")
    # Store the dispatch target on the namespace as ``_handler``.
    test_parser.set_defaults(_handler=run)
15
+
16
+
17
def run(args: argparse.Namespace) -> int:
    """Run the selected test profile and return the result of ``run_tests``."""
    # Function-local import: app.workflows is loaded only when this runs.
    from app.workflows import run_tests

    # An empty positional list is passed on as None.
    forwarded = args.pytest_args if args.pytest_args else None
    return run_tests(args.profile, extra_args=forwarded)
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+
5
+
6
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``validate`` subcommand to *sub*.

    All positionals are collected into ``pytest_args``.
    """
    validate_parser = sub.add_parser(
        "validate",
        help="run parity and acceptance validation",
    )
    validate_parser.add_argument("pytest_args", nargs="*", help="extra pytest arguments")
    # Store the dispatch target on the namespace as ``_handler``.
    validate_parser.set_defaults(_handler=run)
10
+
11
+
12
def run(args: argparse.Namespace) -> int:
    """Run the ``parity`` test profile and return the result of ``run_tests``."""
    # Function-local import: app.workflows is loaded only when this runs.
    from app.workflows import run_tests

    # An empty positional list is passed on as None.
    forwarded = args.pytest_args if args.pytest_args else None
    return run_tests("parity", extra_args=forwarded)
app/workflows.py ADDED
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ import subprocess
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ _ROOT = Path(__file__).resolve().parents[1]
9
+
10
+
11
def test_profiles() -> dict[str, dict[str, Any]]:
    """Return the table of named pytest profiles.

    Each profile maps to a dict with three keys: ``markers`` (a pytest ``-m``
    expression), ``parallel`` (bool) and ``paths`` (list of test paths). A
    fresh table is built on every call.
    """
    parity_paths = [
        "tests/subsystems/test_stage_pool_parity.py",
        "tests/subsystems/test_parallel_merge_parity.py",
        "tests/subsystems/test_tier_admission_parity.py",
        "tests/subsystems/test_execution_backend.py",
    ]
    table: dict[str, dict[str, Any]] = {}
    table["unit"] = {
        "markers": "not integration and not slow",
        "parallel": True,
        "paths": ["tests/"],
    }
    table["critical"] = {
        "markers": "critical and not integration",
        "parallel": False,
        "paths": ["tests/subsystems/"],
    }
    table["parity"] = {
        "markers": "integration",
        "parallel": False,
        "paths": parity_paths,
    }
    table["integration"] = {
        "markers": "integration or slow",
        "parallel": False,
        "paths": ["tests/"],
    }
    table["smoke"] = {
        "markers": "smoke",
        "parallel": False,
        "paths": ["tests/"],
    }
    return table
28
+
29
+
30
def run_tests(profile: str = "unit", *, extra_args: list[str] | None = None) -> int:
    """Run pytest for a named profile in a subprocess and return its exit code.

    Args:
        profile: Key into the table returned by ``test_profiles``.
        extra_args: Extra arguments appended to the pytest command line.

    Returns:
        The return code of the pytest subprocess.

    Raises:
        ValueError: If *profile* is not a known profile name.
    """
    profiles = test_profiles()
    spec = profiles.get(profile)
    if spec is None:
        raise ValueError(f"unknown profile {profile}; choose from {sorted(profiles)}")

    # Run pytest through the current interpreter.
    cmd = [sys.executable, "-m", "pytest"]
    cmd += spec["paths"]
    cmd += ["-m", spec["markers"], "--tb=short", "--strict-markers"]
    # Parallel profiles pass -n/--dist and run quietly; the others are verbose.
    cmd += ["-n", "auto", "--dist", "loadfile", "-q"] if spec.get("parallel") else ["-v"]
    cmd += extra_args or []

    completed = subprocess.run(cmd, cwd=_ROOT)
    return completed.returncode
43
+
44
+
45
def run_benchmark(*, workers: str = "1 2 4") -> int:
    """Run ``scripts/production_scale_audit.py`` and return its exit code.

    Args:
        workers: Whitespace-separated values; each token becomes its own
            argument after ``--workers``.

    Returns:
        The subprocess return code, or 1 (with a message on stderr) when the
        script file does not exist.
    """
    script = _ROOT / "scripts" / "production_scale_audit.py"
    if script.is_file():
        cmd = [sys.executable, str(script), "--workers"] + workers.split()
        completed = subprocess.run(cmd, cwd=_ROOT)
        return completed.returncode

    print("benchmark script missing", file=sys.stderr)
    return 1
52
+
53
+
54
def run_audit(*, kind: str = "pipeline", work_dir: Path | None = None, workers: int = 4) -> int:
    """Run one of the audit scripts under ``scripts/`` and return its exit code.

    Args:
        kind: Audit to run. A kind that is not in the table below falls back
            to the ``pipeline`` script.
        work_dir: Forwarded as ``--work-dir``, only when ``kind == "pipeline"``.
        workers: Forwarded as ``--workers``, only for the ``stage0`` audit.

    Returns:
        The subprocess return code, or 1 (with a message on stderr) when the
        selected script file does not exist.
    """
    # kind -> (script path relative to _ROOT, fixed extra arguments)
    scripts = {
        "pipeline": ("scripts/pipeline_audit.py", []),
        "dask": ("scripts/dask_integration_report.py", []),
        "production": ("scripts/production_scale_audit.py", ["--workers", "1", "2"]),
        "library": ("scripts/library_migration_report.py", []),
        "stage0": ("scripts/stage0_production_verify.py", ["--workers", str(workers)]),
    }
    # NOTE(review): unknown kinds silently run the pipeline audit; the CLI
    # restricts --kind via ``choices``, so this only affects direct callers.
    rel, extra = scripts.get(kind, scripts["pipeline"])
    script = _ROOT / rel
    if not script.is_file():
        # Include the resolved path: because of the fallback above, ``kind``
        # alone does not identify which file was looked up.
        print(f"audit script not found: {kind} ({script})", file=sys.stderr)
        return 1

    cmd = [sys.executable, str(script), *extra]
    if work_dir is not None and kind == "pipeline":
        cmd.extend(["--work-dir", str(work_dir)])
    return subprocess.run(cmd, cwd=_ROOT).returncode
indw/__init__.py ADDED
@@ -0,0 +1,42 @@
1
+ from importlib.metadata import PackageNotFoundError, version
2
+
3
+ try:
4
+ __version__ = version('indw')
5
+ except PackageNotFoundError:
6
+ __version__ = '0.0.0'
7
+
8
+ _LAZY_EXPORTS = {
9
+ 'CorpusRegistry': ('indw.store.corpus.registry', 'CorpusRegistry'),
10
+ 'DatasetDownloader': ('indw.ingest.download', 'DatasetDownloader'),
11
+ 'FastDatasetPipeline': ('indw.ingest.run', 'FastDatasetPipeline'),
12
+ 'setup_dataset_logging': ('indw.ingest.log', 'setup_dataset_logging'),
13
+ 'ScriptProfile': ('indw.filter.language.script', 'ScriptProfile'),
14
+ 'analyze_script_profile': ('indw.filter.language.script', 'analyze_script_profile'),
15
+ 'MultilingualPolicyConfig': ('indw.filter.language.script_policy', 'MultilingualPolicyConfig'),
16
+ 'MixtureOrchestrationConfig': ('indw.schedule.mix.config', 'MixtureOrchestrationConfig'),
17
+ 'CorpusMixturePlan': ('indw.schedule.mix.plan', 'CorpusMixturePlan'),
18
+ 'adapt_mixture_from_telemetry': ('indw.schedule.mix.telemetry', 'adapt_mixture_from_telemetry'),
19
+ 'build_corpus_mixture_plan': ('indw.schedule.mix.mixture_planner', 'build_corpus_mixture_plan'),
20
+ 'QualityPipelineConfig': ('indw.filter.spec.quality', 'QualityPipelineConfig'),
21
+ 'merge_with_quality': ('indw.schedule.core', 'merge_with_quality'),
22
+ 'QualityGate': ('indw.filter.gate.quality', 'QualityGate'),
23
+ 'export_token_bins_fast': ('indw.store.export.fast_export', 'export_token_bins_fast'),
24
+ 'build_pretrain_dataloader': ('indw.store.export.memmap_stream', 'build_pretrain_dataloader'),
25
+ 'build_val_dataloader': ('indw.store.export.memmap_stream', 'build_val_dataloader'),
26
+ }
27
+
28
+ __all__ = [
29
+ '__version__',
30
+ *sorted(_LAZY_EXPORTS),
31
+ ]
32
+
33
+
34
def __getattr__(name: str):
    """Resolve a lazily exported attribute on first access.

    Names listed in ``_LAZY_EXPORTS`` are imported from their target module,
    stored in this module's globals and returned. Any other name raises
    ``AttributeError``.
    """
    target = _LAZY_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')

    import importlib

    module_name, attribute = target
    resolved = getattr(importlib.import_module(module_name), attribute)
    # Store the value in globals so later lookups bypass this hook.
    globals()[name] = resolved
    return resolved
indw/_compat.py ADDED
@@ -0,0 +1,35 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import importlib.abc
5
+ import importlib.util
6
+ import sys
7
+
8
+ _installed = False
9
+
10
+
11
class _IndwCompatFinder(importlib.abc.MetaPathFinder):
    """Meta-path finder that maps ``data`` / ``data.*`` imports onto ``indw`` / ``indw.*``."""

    def find_spec(self, fullname, path, target=None):
        # Only the ``data`` namespace is handled here. Returning None hands
        # the request to the remaining finders on sys.meta_path.
        if fullname != "data" and not fullname.startswith("data."):
            return None
        # Replace the leading "data" (4 characters) with "indw", keeping any
        # dotted suffix.
        indw_name = "indw" if fullname == "data" else "indw" + fullname[4:]
        if fullname in sys.modules:
            return importlib.util.spec_from_loader(fullname, loader=None)
        try:
            mod = importlib.import_module(indw_name)
        except ModuleNotFoundError:
            # No matching indw module: decline, so the import fails or is
            # resolved elsewhere.
            return None
        # Register the indw module object under the requested ``data`` name.
        sys.modules[fullname] = mod
        # NOTE(review): the spec is built with loader=None; this looks like it
        # relies on the import system picking the module up from sys.modules
        # rather than loading from this spec. Confirm on supported Pythons.
        return importlib.util.spec_from_loader(fullname, loader=None)
24
+
25
+
26
def install_compat() -> None:
    """Install the compat finder at the front of ``sys.meta_path``, once.

    Later calls return immediately via the module-level ``_installed`` flag.
    No second finder is added if one is already on ``sys.meta_path``.
    """
    global _installed
    if _installed:
        return

    already_present = any(
        isinstance(finder, _IndwCompatFinder) for finder in sys.meta_path
    )
    if not already_present:
        # Index 0 places it ahead of the finders already registered.
        sys.meta_path.insert(0, _IndwCompatFinder())
    _installed = True
indw/clean/__init__.py ADDED
@@ -0,0 +1,29 @@
1
# Names re-exported by this package; each one is resolved lazily on first
# attribute access.
__all__ = [
    'CorpusCleaningPipeline',
    'CleaningResult',
    'extract_row_text',
    'final_pass_jsonl_row',
    'process_jsonl_row',
    'row_text_key',
]

# Maps each public name to the (module path, attribute) that provides it.
# Every export lives in indw.clean.corpus under the same name.
_LAZY = {public_name: ('indw.clean.corpus', public_name) for public_name in __all__}
18
+
19
+
20
def __getattr__(name: str):
    """Import a lazily exported attribute on first access (PEP 562 hook).

    Names registered in ``_LAZY`` are imported from their source module,
    cached in this module's globals, and returned.

    Raises:
        AttributeError: if ``name`` is not registered in ``_LAZY``.
    """
    if name not in _LAZY:
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')

    import importlib

    source_module, source_attr = _LAZY[name]
    resolved = getattr(importlib.import_module(source_module), source_attr)
    globals()[name] = resolved
    return resolved
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from indw.clean.artifact.discovery_config import DiscoveryConfig
9
+ from indw.clean.artifact.discovery_corpus import CorpusStatsAccumulator
10
+ from indw.clean.artifact.discovery_registry import DynamicArtifactRegistry
11
+
12
+ @dataclass
13
+ class ShadowDisagreement:
14
+ doc_id: str = ''
15
+ legacy_ratio: float = 0.0
16
+ discovery_ratio: float = 0.0
17
+ delta: float = 0.0
18
+
19
+ def to_dict(self) -> dict[str, Any]:
20
+ return {
21
+ 'doc_id': self.doc_id,
22
+ 'legacy_ratio': round(self.legacy_ratio, 4),
23
+ 'discovery_ratio': round(self.discovery_ratio, 4),
24
+ 'delta': round(self.delta, 4),
25
+ }
26
+
27
+ @dataclass
28
+ class CalibrationReport:
29
+ batch_id: int = 0
30
+ docs_seen: int = 0
31
+ promoted: int = 0
32
+ demoted: int = 0
33
+ registry_size: int = 0
34
+ shadow_disagreements: list[ShadowDisagreement] = field(default_factory=list)
35
+ trim_threshold: float = 0.92
36
+
37
+ def to_dict(self) -> dict[str, Any]:
38
+ return {
39
+ 'batch_id': self.batch_id,
40
+ 'docs_seen': self.docs_seen,
41
+ 'promoted': self.promoted,
42
+ 'demoted': self.demoted,
43
+ 'registry_size': self.registry_size,
44
+ 'trim_threshold': self.trim_threshold,
45
+ 'shadow_disagreements': [d.to_dict() for d in self.shadow_disagreements[-500:]],
46
+ }
47
+
48
+ def batch_calibrate(
49
+ accumulator: CorpusStatsAccumulator,
50
+ registry: DynamicArtifactRegistry,
51
+ config: DiscoveryConfig,
52
+ *,
53
+ corpus_dir: str = '',
54
+ shadow_disagreements: list[ShadowDisagreement] | None = None,
55
+ ) -> CalibrationReport:
56
+ accumulator.end_batch(decay=config.decay)
57
+ cal = registry.calibrate(accumulator)
58
+ report = CalibrationReport(
59
+ batch_id=accumulator.batch_id,
60
+ docs_seen=accumulator.docs_seen,
61
+ promoted=cal['promoted'],
62
+ demoted=cal['demoted'],
63
+ registry_size=cal['total'],
64
+ shadow_disagreements=shadow_disagreements or [],
65
+ trim_threshold=config.min_trim_confidence,
66
+ )
67
+ if corpus_dir:
68
+ out = Path(corpus_dir) / 'discovery_calibration.json'
69
+ out.parent.mkdir(parents=True, exist_ok=True)
70
+ existing: list[dict] = []
71
+ if out.exists():
72
+ try:
73
+ existing = json.loads(out.read_text(encoding='utf-8'))
74
+ if not isinstance(existing, list):
75
+ existing = [existing]
76
+ except (json.JSONDecodeError, OSError):
77
+ existing = []
78
+ existing.append(report.to_dict())
79
+ out.write_text(json.dumps(existing[-50:], indent=2), encoding='utf-8')
80
+ return report
@@ -0,0 +1,132 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from indw.clean.artifact.discovery_corpus import CorpusStatsAccumulator
6
+ from indw.clean.artifact.decompose import DocumentUnit, position_bin
7
+ from indw.clean.artifact.discovery_registry import ArtifactEntry, DynamicArtifactRegistry
8
+ from indw.clean.artifact.safeguards import is_protected_unit
9
+ from indw.clean.document.value import analyze_content_value
10
+
11
@dataclass
class FusedConfidence:
    """Per-unit confidence signals together with the resulting trim decision."""

    artifact_confidence: float = 0.0
    knowledge_confidence: float = 0.0
    frequency_confidence: float = 0.0
    position_confidence: float = 0.0
    structural_confidence: float = 0.0
    novelty_confidence: float = 0.0
    repetition_confidence: float = 0.0
    coverage_confidence: float = 0.0
    entropy_confidence: float = 0.0
    trim_tier: str = 'keep'
    would_trim: bool = False

    def to_dict(self) -> dict[str, float | bool | str]:
        """Return a plain dict; every confidence is rounded to 4 decimal places."""
        score_names = (
            'artifact_confidence',
            'knowledge_confidence',
            'frequency_confidence',
            'position_confidence',
            'structural_confidence',
            'novelty_confidence',
            'repetition_confidence',
            'coverage_confidence',
            'entropy_confidence',
        )
        payload: dict[str, float | bool | str] = {
            score: round(getattr(self, score), 4) for score in score_names
        }
        payload['trim_tier'] = self.trim_tier
        payload['would_trim'] = self.would_trim
        return payload
39
+
40
class ConfidenceFusion:
    """Fuses registry confidences with local content value into a trim decision."""

    def __init__(
        self,
        *,
        min_trim_confidence: float = 0.92,
        medium_trim_confidence: float = 0.72,
        knowledge_dampen: float = 0.55,
    ) -> None:
        """Store the decision thresholds.

        Args:
            min_trim_confidence: Artifact score required for the 'high' tier.
            medium_trim_confidence: Artifact score required for the 'medium' tier.
            knowledge_dampen: Knowledge score above which the artifact score is
                damped; also the knowledge ceiling for the 'high' tier.
        """
        self.min_trim_confidence = min_trim_confidence
        self.medium_trim_confidence = medium_trim_confidence
        self.knowledge_dampen = knowledge_dampen

    def fuse_unit(
        self,
        unit: DocumentUnit,
        entry: ArtifactEntry | None,
        *,
        doc_text: str,
        count_in_doc: int = 1,
        doc_len: int = 0,
    ) -> FusedConfidence:
        """Score one unit and decide whether it would be trimmed.

        Args:
            unit: The document unit being evaluated.
            entry: Registry entry for the unit; ``None`` yields an all-default
                ``FusedConfidence`` (tier 'keep').
            doc_text: Full document text, used for the surrounding context.
            count_in_doc: Occurrences of this unit's key in the document.
            doc_len: Document length for position binning; when not positive,
                ``len(doc_text)`` is used instead.

        Returns:
            The fused confidences with ``trim_tier`` one of 'keep',
            'protected', 'medium' or 'high'.
        """
        if entry is None:
            return FusedConfidence()

        if is_protected_unit(unit.text, kind=unit.kind, in_fence=unit.in_fence):
            return FusedConfidence(knowledge_confidence=1.0, trim_tier='protected')

        # Judge content value on the unit plus up to 200 characters either side.
        ctx_start = max(0, unit.start - 200)
        ctx_end = min(len(doc_text), unit.end + 200)
        cv = analyze_content_value(doc_text[ctx_start:ctx_end])
        knowledge = cv.overall_value_score
        if cv.evidence and cv.evidence.preserve:
            knowledge = max(knowledge, 0.85)

        # 0.0 for a single occurrence, approaching 1.0 as the count grows.
        repetition = 1.0 - min(1.0, 1.0 / max(count_in_doc, 1))
        coverage = entry.frequency_confidence
        entropy_signal = 1.0 - entry.novelty_confidence

        # Blend the registry score with in-document repetition and coverage,
        # then damp it in proportion to how far knowledge exceeds the threshold.
        artifact = min(
            1.0,
            entry.artifact_confidence * 0.75 + repetition * 0.15 + coverage * 0.10,
        )
        if knowledge > self.knowledge_dampen:
            artifact *= max(0.0, 1.0 - (knowledge - self.knowledge_dampen))

        tier = 'keep'
        would_trim = False
        if (
            artifact >= self.min_trim_confidence
            and knowledge < self.knowledge_dampen
            and entry.novelty_confidence < 0.35
        ):
            # Code and fenced content is never trimmed, even at high confidence.
            if not unit.in_fence and unit.kind != 'code':
                tier = 'high'
                would_trim = True
        elif (
            artifact >= self.medium_trim_confidence
            and knowledge < 0.45
            and entry.novelty_confidence < 0.25
        ):
            # Without a caller-supplied length, bin against the real document
            # length rather than a length of 1.
            effective_len = doc_len if doc_len > 0 else len(doc_text)
            bin_idx = position_bin(unit.start, max(effective_len, 1))
            # Bins 0 and 4 are presumably the document's edge bins — confirm
            # against position_bin.
            if bin_idx in (0, 4) and unit.kind in ('line', 'header', 'footer'):
                tier = 'medium'
                would_trim = True

        return FusedConfidence(
            artifact_confidence=artifact,
            knowledge_confidence=knowledge,
            frequency_confidence=entry.frequency_confidence,
            position_confidence=entry.position_confidence,
            structural_confidence=entry.structural_confidence,
            novelty_confidence=entry.novelty_confidence,
            repetition_confidence=repetition,
            coverage_confidence=coverage,
            entropy_confidence=entropy_signal,
            trim_tier=tier,
            would_trim=would_trim,
        )

    def fuse_document(
        self,
        units: list[DocumentUnit],
        registry: DynamicArtifactRegistry,
        accumulator: CorpusStatsAccumulator,
        doc_text: str,
        *,
        key_counts: dict[str, int] | None = None,
    ) -> list[tuple[DocumentUnit, FusedConfidence]]:
        """Fuse every unit of a document, preserving input order.

        Args:
            units: Units of the document.
            registry: Registry used to look up each unit's entry.
            accumulator: Corpus statistics passed through to the lookup.
            doc_text: Full document text.
            key_counts: Optional per-fragment-key occurrence counts; keys that
                are absent (or a missing mapping) count as 1.

        Returns:
            ``(unit, fused)`` pairs, one per input unit.
        """
        # Resolved once per call instead of once per unit.
        from indw.clean.artifact.discovery_corpus import fragment_key

        counts: dict[str, int] = key_counts or {}
        doc_len = len(doc_text)
        out: list[tuple[DocumentUnit, FusedConfidence]] = []
        for unit in units:
            key = fragment_key(unit.text, unit.layout)
            cnt = counts.get(key, 1)
            entry = registry.lookup(unit.text, accumulator, layout=unit.layout, count_in_doc=cnt)
            fused = self.fuse_unit(
                unit, entry, doc_text=doc_text, count_in_doc=cnt, doc_len=doc_len,
            )
            out.append((unit, fused))
        return out