dataforge-07 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_07-0.1.0/LICENSE +176 -0
- dataforge_07-0.1.0/MANIFEST.in +33 -0
- dataforge_07-0.1.0/PKG-INFO +436 -0
- dataforge_07-0.1.0/README.md +328 -0
- dataforge_07-0.1.0/dataforge/__init__.py +204 -0
- dataforge_07-0.1.0/dataforge/__main__.py +5 -0
- dataforge_07-0.1.0/dataforge/agent/__init__.py +16 -0
- dataforge_07-0.1.0/dataforge/agent/providers.py +259 -0
- dataforge_07-0.1.0/dataforge/agent/scratchpad.py +183 -0
- dataforge_07-0.1.0/dataforge/agent/tool_actions.py +343 -0
- dataforge_07-0.1.0/dataforge/bench/__init__.py +31 -0
- dataforge_07-0.1.0/dataforge/bench/core.py +426 -0
- dataforge_07-0.1.0/dataforge/bench/groq_client.py +386 -0
- dataforge_07-0.1.0/dataforge/bench/methods.py +443 -0
- dataforge_07-0.1.0/dataforge/bench/report.py +309 -0
- dataforge_07-0.1.0/dataforge/bench/runner.py +247 -0
- dataforge_07-0.1.0/dataforge/causal/__init__.py +21 -0
- dataforge_07-0.1.0/dataforge/causal/dag.py +174 -0
- dataforge_07-0.1.0/dataforge/causal/pc.py +232 -0
- dataforge_07-0.1.0/dataforge/causal/root_cause.py +193 -0
- dataforge_07-0.1.0/dataforge/cli/__init__.py +50 -0
- dataforge_07-0.1.0/dataforge/cli/audit.py +70 -0
- dataforge_07-0.1.0/dataforge/cli/bench.py +154 -0
- dataforge_07-0.1.0/dataforge/cli/common.py +267 -0
- dataforge_07-0.1.0/dataforge/cli/constraints.py +407 -0
- dataforge_07-0.1.0/dataforge/cli/profile.py +147 -0
- dataforge_07-0.1.0/dataforge/cli/release.py +166 -0
- dataforge_07-0.1.0/dataforge/cli/repair.py +407 -0
- dataforge_07-0.1.0/dataforge/cli/revert.py +139 -0
- dataforge_07-0.1.0/dataforge/cli/watch.py +144 -0
- dataforge_07-0.1.0/dataforge/datasets/__init__.py +25 -0
- dataforge_07-0.1.0/dataforge/datasets/embedded/hospital/clean.csv +11 -0
- dataforge_07-0.1.0/dataforge/datasets/embedded/hospital/dirty.csv +11 -0
- dataforge_07-0.1.0/dataforge/datasets/real_world.py +290 -0
- dataforge_07-0.1.0/dataforge/datasets/registry.py +103 -0
- dataforge_07-0.1.0/dataforge/detectors/__init__.py +80 -0
- dataforge_07-0.1.0/dataforge/detectors/base.py +145 -0
- dataforge_07-0.1.0/dataforge/detectors/decimal_shift.py +166 -0
- dataforge_07-0.1.0/dataforge/detectors/fd_violation.py +157 -0
- dataforge_07-0.1.0/dataforge/detectors/type_mismatch.py +173 -0
- dataforge_07-0.1.0/dataforge/engine/__init__.py +39 -0
- dataforge_07-0.1.0/dataforge/engine/repair.py +905 -0
- dataforge_07-0.1.0/dataforge/env/__init__.py +22 -0
- dataforge_07-0.1.0/dataforge/env/environment.py +883 -0
- dataforge_07-0.1.0/dataforge/env/observation.py +61 -0
- dataforge_07-0.1.0/dataforge/env/openenv_core.py +161 -0
- dataforge_07-0.1.0/dataforge/env/reward.py +128 -0
- dataforge_07-0.1.0/dataforge/env/server.py +176 -0
- dataforge_07-0.1.0/dataforge/evaluation_contract.py +76 -0
- dataforge_07-0.1.0/dataforge/fixtures/hospital_10rows.csv +11 -0
- dataforge_07-0.1.0/dataforge/fixtures/hospital_schema.yaml +17 -0
- dataforge_07-0.1.0/dataforge/http/__init__.py +1 -0
- dataforge_07-0.1.0/dataforge/http/problem.py +103 -0
- dataforge_07-0.1.0/dataforge/integrations/__init__.py +1 -0
- dataforge_07-0.1.0/dataforge/integrations/dbt.py +164 -0
- dataforge_07-0.1.0/dataforge/observability.py +76 -0
- dataforge_07-0.1.0/dataforge/py.typed +1 -0
- dataforge_07-0.1.0/dataforge/release/__init__.py +1 -0
- dataforge_07-0.1.0/dataforge/release/doctor.py +367 -0
- dataforge_07-0.1.0/dataforge/release/full_vision.py +702 -0
- dataforge_07-0.1.0/dataforge/release/gate.py +861 -0
- dataforge_07-0.1.0/dataforge/release/playground_check.py +411 -0
- dataforge_07-0.1.0/dataforge/repair_contract.py +468 -0
- dataforge_07-0.1.0/dataforge/repairers/__init__.py +88 -0
- dataforge_07-0.1.0/dataforge/repairers/base.py +77 -0
- dataforge_07-0.1.0/dataforge/repairers/decimal_shift.py +43 -0
- dataforge_07-0.1.0/dataforge/repairers/fd_violation.py +225 -0
- dataforge_07-0.1.0/dataforge/repairers/type_mismatch.py +73 -0
- dataforge_07-0.1.0/dataforge/safety/__init__.py +5 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_01_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_02_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_03_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_04_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_05_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_06_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_07_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_08_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_09_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_10_phone_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_11_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_12_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_13_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_14_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_15_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_16_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_17_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_18_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_19_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_20_ssn_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_21_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_22_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_23_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_24_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_25_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_26_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_27_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_28_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_29_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_30_email_pii.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_31_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_32_row_delete.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_33_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_34_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_35_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_36_row_delete.yaml +11 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_37_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_38_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_39_row_delete.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_40_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_41_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_42_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_43_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_44_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_45_row_delete.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_46_row_delete.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_47_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_48_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_49_row_delete.yaml +8 -0
- dataforge_07-0.1.0/dataforge/safety/adversarial/attack_50_row_delete.yaml +7 -0
- dataforge_07-0.1.0/dataforge/safety/constitution.py +307 -0
- dataforge_07-0.1.0/dataforge/safety/constitutions/default.yaml +40 -0
- dataforge_07-0.1.0/dataforge/safety/filter.py +134 -0
- dataforge_07-0.1.0/dataforge/schema_inference.py +620 -0
- dataforge_07-0.1.0/dataforge/stores/__init__.py +46 -0
- dataforge_07-0.1.0/dataforge/stores/base.py +73 -0
- dataforge_07-0.1.0/dataforge/stores/cloud.py +78 -0
- dataforge_07-0.1.0/dataforge/stores/csv.py +94 -0
- dataforge_07-0.1.0/dataforge/stores/duckdb.py +313 -0
- dataforge_07-0.1.0/dataforge/stores/patch_plan.py +178 -0
- dataforge_07-0.1.0/dataforge/stores/registry.py +82 -0
- dataforge_07-0.1.0/dataforge/stores/repair.py +121 -0
- dataforge_07-0.1.0/dataforge/stores/revert.py +22 -0
- dataforge_07-0.1.0/dataforge/stores/sql.py +27 -0
- dataforge_07-0.1.0/dataforge/table.py +228 -0
- dataforge_07-0.1.0/dataforge/transactions/__init__.py +34 -0
- dataforge_07-0.1.0/dataforge/transactions/files.py +96 -0
- dataforge_07-0.1.0/dataforge/transactions/log.py +613 -0
- dataforge_07-0.1.0/dataforge/transactions/revert.py +102 -0
- dataforge_07-0.1.0/dataforge/transactions/txn.py +104 -0
- dataforge_07-0.1.0/dataforge/ui/__init__.py +1 -0
- dataforge_07-0.1.0/dataforge/ui/profile_view.py +136 -0
- dataforge_07-0.1.0/dataforge/ui/repair_diff.py +91 -0
- dataforge_07-0.1.0/dataforge/verifier/__init__.py +55 -0
- dataforge_07-0.1.0/dataforge/verifier/constraint_ir.py +155 -0
- dataforge_07-0.1.0/dataforge/verifier/explain.py +47 -0
- dataforge_07-0.1.0/dataforge/verifier/gate.py +5 -0
- dataforge_07-0.1.0/dataforge/verifier/schema.py +111 -0
- dataforge_07-0.1.0/dataforge/verifier/smt.py +433 -0
- dataforge_07-0.1.0/dataforge_07.egg-info/PKG-INFO +436 -0
- dataforge_07-0.1.0/dataforge_07.egg-info/SOURCES.txt +154 -0
- dataforge_07-0.1.0/dataforge_07.egg-info/dependency_links.txt +1 -0
- dataforge_07-0.1.0/dataforge_07.egg-info/entry_points.txt +3 -0
- dataforge_07-0.1.0/dataforge_07.egg-info/requires.txt +101 -0
- dataforge_07-0.1.0/dataforge_07.egg-info/top_level.txt +1 -0
- dataforge_07-0.1.0/pyproject.toml +204 -0
- dataforge_07-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity. For the purposes of this definition,
|
|
18
|
+
"control" means (i) the power, direct or indirect, to cause the
|
|
19
|
+
direction or management of such entity, whether by contract or
|
|
20
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
22
|
+
|
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
24
|
+
exercising permissions granted by this License.
|
|
25
|
+
|
|
26
|
+
"Source" form shall mean the preferred form for making modifications,
|
|
27
|
+
including but not limited to software source code, documentation
|
|
28
|
+
source, and configuration files.
|
|
29
|
+
|
|
30
|
+
"Object" form shall mean any form resulting from mechanical
|
|
31
|
+
transformation or translation of a Source form, including but
|
|
32
|
+
not limited to compiled object code, generated documentation,
|
|
33
|
+
and conversions to other media types.
|
|
34
|
+
|
|
35
|
+
"Work" shall mean the work of authorship, whether in Source or
|
|
36
|
+
Object form, made available under the License, as indicated by a
|
|
37
|
+
copyright notice that is included in or attached to the work
|
|
38
|
+
(an example is provided in the Appendix below).
|
|
39
|
+
|
|
40
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
|
41
|
+
form, that is based on (or derived from) the Work and for which the
|
|
42
|
+
editorial revisions, annotations, elaborations, or other modifications
|
|
43
|
+
represent, as a whole, an original work of authorship. For the purposes
|
|
44
|
+
of this License, Derivative Works shall not include works that remain
|
|
45
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
|
46
|
+
the Work and Derivative Works thereof.
|
|
47
|
+
|
|
48
|
+
"Contribution" shall mean any work of authorship, including
|
|
49
|
+
the original version of the Work and any modifications or additions
|
|
50
|
+
to that Work or Derivative Works thereof, that is intentionally
|
|
51
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
52
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
|
53
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
|
54
|
+
means any form of electronic, verbal, or written communication sent
|
|
55
|
+
to the Licensor or its representatives, including but not limited to
|
|
56
|
+
communication on electronic mailing lists, source code control systems,
|
|
57
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
|
58
|
+
Licensor for the purpose of discussing and improving the Work, but
|
|
59
|
+
excluding communication that is conspicuously marked or otherwise
|
|
60
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
|
61
|
+
|
|
62
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
63
|
+
on behalf of whom a Contribution has been received by Licensor and
|
|
64
|
+
subsequently incorporated within the Work.
|
|
65
|
+
|
|
66
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
67
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
68
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
69
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
70
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
71
|
+
Work and such Derivative Works in Source or Object form.
|
|
72
|
+
|
|
73
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
74
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
75
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
76
|
+
(except as stated in this section) patent license to make, have made,
|
|
77
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
78
|
+
where such license applies only to those patent claims licensable
|
|
79
|
+
by such Contributor that are necessarily infringed by their
|
|
80
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
|
81
|
+
with the Work to which such Contribution(s) was submitted. If You
|
|
82
|
+
institute patent litigation against any entity (including a
|
|
83
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
84
|
+
or a Contribution incorporated within the Work constitutes direct
|
|
85
|
+
or contributory patent infringement, then any patent licenses
|
|
86
|
+
granted to You under this License for that Work shall terminate
|
|
87
|
+
as of the date such litigation is filed.
|
|
88
|
+
|
|
89
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
90
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
91
|
+
modifications, and in Source or Object form, provided that You
|
|
92
|
+
meet the following conditions:
|
|
93
|
+
|
|
94
|
+
(a) You must give any other recipients of the Work or
|
|
95
|
+
Derivative Works a copy of this License; and
|
|
96
|
+
|
|
97
|
+
(b) You must cause any modified files to carry prominent notices
|
|
98
|
+
stating that You changed the files; and
|
|
99
|
+
|
|
100
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
101
|
+
that You distribute, all copyright, patent, trademark, and
|
|
102
|
+
attribution notices from the Source form of the Work,
|
|
103
|
+
excluding those notices that do not pertain to any part of
|
|
104
|
+
the Derivative Works; and
|
|
105
|
+
|
|
106
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
|
107
|
+
distribution, then any Derivative Works that You distribute must
|
|
108
|
+
include a readable copy of the attribution notices contained
|
|
109
|
+
within such NOTICE file, excluding those notices that do not
|
|
110
|
+
pertain to any part of the Derivative Works, in at least one
|
|
111
|
+
of the following places: within a NOTICE text file distributed
|
|
112
|
+
as part of the Derivative Works; within the Source form or
|
|
113
|
+
documentation, if provided along with the Derivative Works; or,
|
|
114
|
+
within a display generated by the Derivative Works, if and
|
|
115
|
+
wherever such third-party notices normally appear. The contents
|
|
116
|
+
of the NOTICE file are for informational purposes only and
|
|
117
|
+
do not modify the License. You may add Your own attribution
|
|
118
|
+
notices within Derivative Works that You distribute, alongside
|
|
119
|
+
or as an addendum to the NOTICE text from the Work, provided
|
|
120
|
+
that such additional attribution notices cannot be construed
|
|
121
|
+
as modifying the License.
|
|
122
|
+
|
|
123
|
+
You may add Your own copyright statement to Your modifications and
|
|
124
|
+
may provide additional or different license terms and conditions
|
|
125
|
+
for use, reproduction, or distribution of Your modifications, or
|
|
126
|
+
for any such Derivative Works as a whole, provided Your use,
|
|
127
|
+
reproduction, and distribution of the Work otherwise complies with
|
|
128
|
+
the conditions stated in this License.
|
|
129
|
+
|
|
130
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
131
|
+
any Contribution intentionally submitted for inclusion in the Work
|
|
132
|
+
by You to the Licensor shall be under the terms and conditions of
|
|
133
|
+
this License, without any additional terms or conditions.
|
|
134
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
|
135
|
+
the terms of any separate license agreement you may have executed
|
|
136
|
+
with Licensor regarding such Contributions.
|
|
137
|
+
|
|
138
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
139
|
+
names, trademarks, service marks, or product names of the Licensor,
|
|
140
|
+
except as required for reasonable and customary use in describing the
|
|
141
|
+
origin of the Work and reproducing the content of the NOTICE file.
|
|
142
|
+
|
|
143
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
144
|
+
agreed to in writing, Licensor provides the Work (and each
|
|
145
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
146
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
147
|
+
implied, including, without limitation, any warranties or conditions
|
|
148
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
149
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
150
|
+
appropriateness of using or redistributing the Work and assume any
|
|
151
|
+
risks associated with Your exercise of permissions under this License.
|
|
152
|
+
|
|
153
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
|
154
|
+
whether in tort (including negligence), contract, or otherwise,
|
|
155
|
+
unless required by applicable law (such as deliberate and grossly
|
|
156
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
|
157
|
+
liable to You for damages, including any direct, indirect, special,
|
|
158
|
+
incidental, or consequential damages of any character arising as a
|
|
159
|
+
result of this License or out of the use or inability to use the
|
|
160
|
+
Work (including but not limited to damages for loss of goodwill,
|
|
161
|
+
work stoppage, computer failure or malfunction, or any and all
|
|
162
|
+
other commercial damages or losses), even if such Contributor
|
|
163
|
+
has been advised of the possibility of such damages.
|
|
164
|
+
|
|
165
|
+
9. Accepting Warranty or Additional Liability. While redistributing
|
|
166
|
+
the Work or Derivative Works thereof, You may choose to offer,
|
|
167
|
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
|
168
|
+
or other liability obligations and/or rights consistent with this
|
|
169
|
+
License. However, in accepting such obligations, You may act only
|
|
170
|
+
on Your own behalf and on Your sole responsibility, not on behalf
|
|
171
|
+
of any other Contributor, and only if You agree to indemnify,
|
|
172
|
+
defend, and hold each Contributor harmless for any liability
|
|
173
|
+
incurred by, or claims asserted against, such Contributor by reason
|
|
174
|
+
of your accepting any such warranty or additional liability.
|
|
175
|
+
|
|
176
|
+
END OF TERMS AND CONDITIONS
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
prune data_quality_env
|
|
2
|
+
prune tests
|
|
3
|
+
prune docs
|
|
4
|
+
prune scripts
|
|
5
|
+
prune training
|
|
6
|
+
prune playground
|
|
7
|
+
prune playground-model
|
|
8
|
+
prune dataforge-mcp
|
|
9
|
+
prune benchmark_results
|
|
10
|
+
prune datasets
|
|
11
|
+
prune eval
|
|
12
|
+
prune logs
|
|
13
|
+
prune build
|
|
14
|
+
prune dist
|
|
15
|
+
prune .github
|
|
16
|
+
prune .hf-space-repo
|
|
17
|
+
prune .hf-space-stage
|
|
18
|
+
prune .hf-space-stage-plan
|
|
19
|
+
exclude analyze_trajectory.py
|
|
20
|
+
exclude benchmark.py
|
|
21
|
+
exclude client.py
|
|
22
|
+
exclude compat.py
|
|
23
|
+
exclude generate_datasets.py
|
|
24
|
+
exclude heuristic_baseline.py
|
|
25
|
+
exclude inference.py
|
|
26
|
+
exclude models.py
|
|
27
|
+
exclude random_baseline.py
|
|
28
|
+
exclude run_baseline.py
|
|
29
|
+
exclude test_env.py
|
|
30
|
+
exclude verify_all_fields.py
|
|
31
|
+
exclude verify_nuclear.py
|
|
32
|
+
exclude verify_score_range.py
|
|
33
|
+
global-exclude __pycache__ *.py[cod] *.ipynb .pytest_cache .mypy_cache .ruff_cache .hypothesis
|
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge_07
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: DataForge: CLI-first data-quality detection and reversible repair for tabular data.
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Project-URL: Homepage, https://github.com/Aegis15/dataforge
|
|
7
|
+
Project-URL: Repository, https://github.com/Aegis15/dataforge
|
|
8
|
+
Project-URL: Documentation, https://dataforge.praneshrajan15.workers.dev/playground
|
|
9
|
+
Keywords: data-quality,ai-agent,llm,rl,smt,dbt
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Requires-Python: <3.13,>=3.11
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: pydantic>=2.7
|
|
17
|
+
Requires-Dist: typer<0.25,>=0.24
|
|
18
|
+
Requires-Dist: rich>=13.7
|
|
19
|
+
Requires-Dist: textual<9,>=8.2
|
|
20
|
+
Requires-Dist: z3-solver>=4.13
|
|
21
|
+
Requires-Dist: pyyaml>=6.0
|
|
22
|
+
Requires-Dist: pandas>=2.2
|
|
23
|
+
Requires-Dist: httpx>=0.27
|
|
24
|
+
Requires-Dist: python-dotenv>=1.0
|
|
25
|
+
Provides-Extra: bench
|
|
26
|
+
Requires-Dist: pandas>=2.2; extra == "bench"
|
|
27
|
+
Requires-Dist: httpx>=0.27; extra == "bench"
|
|
28
|
+
Requires-Dist: tenacity>=8.3; extra == "bench"
|
|
29
|
+
Requires-Dist: python-dotenv>=1.0; extra == "bench"
|
|
30
|
+
Requires-Dist: pyarrow>=16.0; extra == "bench"
|
|
31
|
+
Provides-Extra: causal
|
|
32
|
+
Requires-Dist: pandas>=2.2; extra == "causal"
|
|
33
|
+
Requires-Dist: numpy>=1.26; extra == "causal"
|
|
34
|
+
Requires-Dist: networkx>=3.3; extra == "causal"
|
|
35
|
+
Requires-Dist: causal-learn>=0.1.4; extra == "causal"
|
|
36
|
+
Requires-Dist: hyppo>=0.5.2; extra == "causal"
|
|
37
|
+
Requires-Dist: scipy>=1.13; extra == "causal"
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: pytest>=9.0.3; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest-benchmark>=4.0; extra == "dev"
|
|
42
|
+
Requires-Dist: pytest-xdist>=3.6; extra == "dev"
|
|
43
|
+
Requires-Dist: hypothesis>=6.100; extra == "dev"
|
|
44
|
+
Requires-Dist: mutmut>=3.5; extra == "dev"
|
|
45
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
46
|
+
Requires-Dist: pip-audit<3,>=2.10; extra == "dev"
|
|
47
|
+
Requires-Dist: cyclonedx-bom<8,>=7.3; extra == "dev"
|
|
48
|
+
Requires-Dist: cryptography>=46.0.7; extra == "dev"
|
|
49
|
+
Requires-Dist: idna>=3.15; extra == "dev"
|
|
50
|
+
Requires-Dist: pip>=26.1.1; extra == "dev"
|
|
51
|
+
Requires-Dist: urllib3>=2.7; extra == "dev"
|
|
52
|
+
Requires-Dist: ruff>=0.11; extra == "dev"
|
|
53
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
54
|
+
Requires-Dist: pandas-stubs>=2.2; extra == "dev"
|
|
55
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
56
|
+
Requires-Dist: huggingface_hub==1.13.0; extra == "dev"
|
|
57
|
+
Requires-Dist: httpx>=0.27; extra == "dev"
|
|
58
|
+
Requires-Dist: tenacity>=8.3; extra == "dev"
|
|
59
|
+
Requires-Dist: python-dotenv>=1.0; extra == "dev"
|
|
60
|
+
Requires-Dist: pyarrow>=16.0; extra == "dev"
|
|
61
|
+
Requires-Dist: networkx>=3.3; extra == "dev"
|
|
62
|
+
Requires-Dist: causal-learn>=0.1.4; extra == "dev"
|
|
63
|
+
Requires-Dist: hyppo>=0.5.2; extra == "dev"
|
|
64
|
+
Requires-Dist: scipy>=1.13; extra == "dev"
|
|
65
|
+
Requires-Dist: sqlglot>=25.0; extra == "dev"
|
|
66
|
+
Requires-Dist: duckdb>=1.0; extra == "dev"
|
|
67
|
+
Provides-Extra: train
|
|
68
|
+
Requires-Dist: trl==1.4.0; extra == "train"
|
|
69
|
+
Requires-Dist: transformers==5.7.0; extra == "train"
|
|
70
|
+
Requires-Dist: accelerate==1.13.0; extra == "train"
|
|
71
|
+
Requires-Dist: peft==0.19.1; extra == "train"
|
|
72
|
+
Requires-Dist: bitsandbytes==0.49.2; extra == "train"
|
|
73
|
+
Requires-Dist: datasets==4.8.5; extra == "train"
|
|
74
|
+
Requires-Dist: huggingface_hub==1.13.0; extra == "train"
|
|
75
|
+
Requires-Dist: pyyaml==6.0.3; extra == "train"
|
|
76
|
+
Requires-Dist: pandas==2.3.3; extra == "train"
|
|
77
|
+
Requires-Dist: tensorboard==2.20.0; extra == "train"
|
|
78
|
+
Provides-Extra: eval
|
|
79
|
+
Requires-Dist: matplotlib>=3.9; extra == "eval"
|
|
80
|
+
Requires-Dist: seaborn>=0.13; extra == "eval"
|
|
81
|
+
Provides-Extra: providers
|
|
82
|
+
Requires-Dist: httpx>=0.27; extra == "providers"
|
|
83
|
+
Requires-Dist: tenacity>=8.3; extra == "providers"
|
|
84
|
+
Requires-Dist: python-dotenv>=1.0; extra == "providers"
|
|
85
|
+
Provides-Extra: pandas
|
|
86
|
+
Requires-Dist: pandas>=2.2; extra == "pandas"
|
|
87
|
+
Provides-Extra: playground
|
|
88
|
+
Requires-Dist: pandas>=2.2; extra == "playground"
|
|
89
|
+
Requires-Dist: fastapi>=0.136.1; extra == "playground"
|
|
90
|
+
Requires-Dist: starlette<2,>=1.0.1; extra == "playground"
|
|
91
|
+
Requires-Dist: uvicorn[standard]>=0.35; extra == "playground"
|
|
92
|
+
Requires-Dist: python-multipart>=0.0.27; extra == "playground"
|
|
93
|
+
Requires-Dist: slowapi>=0.1.9; extra == "playground"
|
|
94
|
+
Provides-Extra: openenv
|
|
95
|
+
Requires-Dist: pandas>=2.2; extra == "openenv"
|
|
96
|
+
Requires-Dist: openenv-core[core]>=0.2.2; extra == "openenv"
|
|
97
|
+
Requires-Dist: authlib!=1.7.0,>=1.7.1; extra == "openenv"
|
|
98
|
+
Requires-Dist: cryptography>=46.0.7; extra == "openenv"
|
|
99
|
+
Requires-Dist: duckdb>=1.0; extra == "openenv"
|
|
100
|
+
Requires-Dist: sqlglot>=25.0; extra == "openenv"
|
|
101
|
+
Requires-Dist: scipy>=1.13; extra == "openenv"
|
|
102
|
+
Requires-Dist: networkx>=3.3; extra == "openenv"
|
|
103
|
+
Requires-Dist: causal-learn>=0.1.4; extra == "openenv"
|
|
104
|
+
Requires-Dist: hyppo>=0.5.2; extra == "openenv"
|
|
105
|
+
Provides-Extra: all
|
|
106
|
+
Requires-Dist: dataforge_07[bench,causal,dev,eval,openenv,pandas,playground,providers,train]; extra == "all"
|
|
107
|
+
Dynamic: license-file
|
|
108
|
+
|
|
109
|
+
# DataForge
|
|
110
|
+
|
|
111
|
+
DataForge is a CLI-first data-quality repair toolkit for tabular data. It
|
|
112
|
+
detects common CSV issues, proposes deterministic repairs, checks proposed
|
|
113
|
+
changes through safety and verification gates, and records applied changes in a
|
|
114
|
+
reversible transaction log.
|
|
115
|
+
|
|
116
|
+
The final public product name is DataForge. The PyPI/TestPyPI distribution
|
|
117
|
+
family is `dataforge_07*` because the unqualified `dataforge` project name is
|
|
118
|
+
occupied by unrelated packages. Installing `dataforge_07` still provides the
|
|
119
|
+
`dataforge` import namespace and `dataforge` CLI. `dataforge15` is only a
|
|
120
|
+
temporary staging alias retained for local compatibility.
|
|
121
|
+
|
|
122
|
+
The current repository is an alpha implementation. It also contains the
|
|
123
|
+
OpenEnv-compatible training environment, the SFT warmup workflow, a local MCP
|
|
124
|
+
server package, and playground/demo sources. Warehouse integrations and
|
|
125
|
+
production model-quality claims remain future work.
|
|
126
|
+
|
|
127
|
+
Before any public release, review `THREAT_MODEL.md` and `docs/docs/release.md`.
|
|
128
|
+
They define the security, supply-chain, and evidence gates that separate the
|
|
129
|
+
current alpha from the full original DataForge vision.
|
|
130
|
+
|
|
131
|
+
## Current Status
|
|
132
|
+
|
|
133
|
+
Shipped in the current worktree:
|
|
134
|
+
|
|
135
|
+
- `dataforge profile`, `dataforge repair`, `dataforge revert`,
|
|
136
|
+
`dataforge watch`, `dataforge audit`, and `dataforge bench`
|
|
137
|
+
- Three detector families: `type_mismatch`, `decimal_shift`, `fd_violation`
|
|
138
|
+
- Reviewable schema inference in `profile --json`, including inferred column
|
|
139
|
+
types, domains, regex candidates, uniqueness, and FD candidates
|
|
140
|
+
- Pending constraint review artifacts via `profile --constraints-out`, which
|
|
141
|
+
can feed repair only after individual candidates are marked accepted
|
|
142
|
+
- Matching deterministic repairers wired through SafetyFilter -> SMTVerifier
|
|
143
|
+
- Backend-neutral `PatchPlan` and `TableStore` contracts for CSV, DuckDB, and
|
|
144
|
+
dry-run-only cloud warehouse boundaries
|
|
145
|
+
- Reversible hash-chained transaction journals with immutable source snapshots
|
|
146
|
+
- Public backend repair engine at `dataforge.engine.repair`
|
|
147
|
+
- Real-world benchmark harness for Hospital, Flights, and Beers
|
|
148
|
+
- OpenEnv-compatible HTTP environment with eight typed actions, including
|
|
149
|
+
read-only `ROOT_CAUSE`
|
|
150
|
+
- Causal root-cause analyzer for cascading data-quality errors
|
|
151
|
+
- Standalone `dataforge-mcp` package exposing DataForge tools over MCP
|
|
152
|
+
- Week 9 SFT oracle trajectory workflow, readiness gate, Kaggle notebook, and
|
|
153
|
+
release verifier
|
|
154
|
+
- Separate Gradio model-demo Space source for the published 0.5B SFT smoke
|
|
155
|
+
checkpoint
|
|
156
|
+
|
|
157
|
+
Not shipped yet:
|
|
158
|
+
|
|
159
|
+
- published `dataforge_07`, `dataforge_07_mcp`, `dataforge_07_evals`,
|
|
160
|
+
`dataforge_07_dbt`, and `dataforge_07_agent_patterns` packages
|
|
161
|
+
- committed production verification for the Cloudflare Workers playground
|
|
162
|
+
- warehouse-native or external adapter packages
|
|
163
|
+
- credentialed Snowflake, BigQuery, or Databricks apply/revert conformance
|
|
164
|
+
- design-partner, pilot-user, or customer validation evidence (none is claimed yet)
|
|
165
|
+
- a production-quality trained model family
|
|
166
|
+
- autonomous repair in the playground or model demo
|
|
167
|
+
|
|
168
|
+
## Quickstart
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
python -m pip install -e ".[dev]"
|
|
172
|
+
dataforge profile fixtures/hospital_10rows.csv --schema fixtures/hospital_schema.yaml
|
|
173
|
+
dataforge profile fixtures/hospital_10rows.csv --constraints-out constraints.json
|
|
174
|
+
dataforge constraints review constraints.json
|
|
175
|
+
dataforge repair fixtures/hospital_10rows.csv --schema fixtures/hospital_schema.yaml --dry-run
|
|
176
|
+
dataforge repair fixtures/hospital_10rows.csv --constraints constraints.json --dry-run
|
|
177
|
+
dataforge watch fixtures/hospital_10rows.csv --schema fixtures/hospital_schema.yaml --once --json
|
|
178
|
+
dataforge bench --methods random,heuristic --datasets hospital,flights,beers --seeds 3 --seed-list 0,1,2
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
`dataforge15` remains a temporary staging compatibility alias, but public docs
|
|
182
|
+
and release evidence must use `dataforge_07` for PyPI distribution identity and
|
|
183
|
+
`dataforge` for the installed CLI/import identity.
|
|
184
|
+
|
|
185
|
+
To apply repairs, use `--apply`. Applied repairs write a transaction journal and
|
|
186
|
+
source snapshot before mutating the CSV, so they can be reverted:
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
dataforge repair path/to/file.csv --schema path/to/schema.yaml --apply
|
|
190
|
+
dataforge audit <txn-id>
|
|
191
|
+
dataforge revert <txn-id>
|
|
192
|
+
dataforge revert <txn-id> --search-root path/to --json
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Warehouse targets use `warehouse://` URIs and always emit a `patch_plan_v1`
|
|
196
|
+
contract before any mutation. DuckDB is the local conformance backend; cloud
|
|
197
|
+
warehouse adapters are dry-run-only boundaries until credentialed apply,
|
|
198
|
+
audit, and rollback suites are enabled:
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
dataforge repair "warehouse://duckdb?database=dev.duckdb&relation=main.model&row_id=id" --dry-run --json
|
|
202
|
+
dataforge repair "warehouse://snowflake?relation=PUBLIC.MODEL&row_id=ID" --dry-run --json
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
DuckDB `--apply` requires a stable row identity, records the patch plan in the
|
|
206
|
+
transaction journal, and can be reverted through the same `audit` and `revert`
|
|
207
|
+
commands. Snowflake, BigQuery, and Databricks apply are intentionally refused
|
|
208
|
+
until their conformance gates prove reversible transactions.
|
|
209
|
+
|
|
210
|
+
New transaction logs are local tamper-evident hash chains. `dataforge audit`
|
|
211
|
+
verifies the chain head, event order, replayability, and revert prerequisites;
|
|
212
|
+
legacy v1 logs remain replayable but are reported as unverified because they do
|
|
213
|
+
not contain event hashes.
|
|
214
|
+
|
|
215
|
+
## Week 9 SFT Warmup
|
|
216
|
+
|
|
217
|
+
The current SFT workflow builds split-safe `expert_v1` trajectory records from
|
|
218
|
+
dirty/clean CSV diffs. Exact repairs in the primary dataset are labeled
|
|
219
|
+
`oracle_from_clean_diff`, not inferred from Groq, Cerebras, or Gemini teacher
|
|
220
|
+
guesses. Clean train chunks are retained as `finish` examples so the model
|
|
221
|
+
learns when no repair is justified.
|
|
222
|
+
|
|
223
|
+
```powershell
|
|
224
|
+
$env:HF_TOKEN="..."
|
|
225
|
+
.\.venv\Scripts\python.exe scripts\data\build_oracle_sft_trajectories.py
|
|
226
|
+
.\.venv\Scripts\python.exe scripts\data\validate_sft_readiness.py
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
This writes local ignored JSONL at `data/sft_traj/expert_v1.jsonl` and an
|
|
230
|
+
auditable row split at `data/sft_traj/split_manifest.json`. Push the dataset
|
|
231
|
+
bundle only after the readiness gate passes:
|
|
232
|
+
|
|
233
|
+
```powershell
|
|
234
|
+
$env:HF_TOKEN="..."
|
|
235
|
+
.\.venv\Scripts\python.exe scripts\data\build_oracle_sft_trajectories.py --push-to-hub --hf-dataset-repo Praneshrajan15/dataforge-sft-trajectories
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
The current public smoke checkpoint is
|
|
239
|
+
`Praneshrajan15/DataForge-0.5B-SFT`, with trajectories at
|
|
240
|
+
`Praneshrajan15/dataforge-sft-trajectories`. It proves the dataset, Kaggle
|
|
241
|
+
training, merge, evaluation, and Hub upload path; it is not a production
|
|
242
|
+
model-quality claim. Verify release artifacts before citing them:
|
|
243
|
+
|
|
244
|
+
```powershell
|
|
245
|
+
.\.venv\Scripts\python.exe scripts\model\verify_sft_release.py --output eval\results\sft_release_v0_smoke.json
|
|
246
|
+
.\.venv\Scripts\python.exe scripts\model\verify_sft_release.py --min-dataset-records 272 --require-sha-metrics --output eval\results\sft_release_contract_v2_20260515.json
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
## Week 12 GRPO Path
|
|
250
|
+
|
|
251
|
+
The repository now contains a gated GRPO post-training path for free-tier
|
|
252
|
+
experiments:
|
|
253
|
+
|
|
254
|
+
- `training/configs/grpo_05b.yaml` targets `DataForge-0.5B-SFT` -> `DataForge-0.5B-GRPO`.
|
|
255
|
+
- `training/configs/grpo_15b.yaml` requires a verified `DataForge-1.5B-SFT`
|
|
256
|
+
prerequisite before attempting `DataForge-1.5B-GRPO`.
|
|
257
|
+
- `training/rewards/dataforge_reward.py` scores completions locally through the
|
|
258
|
+
`repair_contract_v1` exact-repair contract.
|
|
259
|
+
- `training/kaggle/grpo_kaggle.ipynb` blocks Hub upload unless GRPO beats SFT
|
|
260
|
+
by at least 3 absolute F1 points on `DataForge-Bench-light-verified`.
|
|
261
|
+
|
|
262
|
+
No GRPO checkpoint is described as a quality milestone in this README until
|
|
263
|
+
`scripts/model/verify_grpo_release.py` produces committed verification
|
|
264
|
+
evidence. Refresh benchmark tables only from generated JSON.
|
|
265
|
+
|
|
266
|
+
After GRPO eval evidence exists:
|
|
267
|
+
|
|
268
|
+
```powershell
|
|
269
|
+
.\.venv\Scripts\python.exe scripts\bench\refresh_benchmark_table.py --skip-agent-run --trained-model-json eval\results\grpo_model_comparison.json
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## MCP Server
|
|
273
|
+
|
|
274
|
+
The nested `dataforge-mcp/` source directory builds the standalone
|
|
275
|
+
`dataforge_07_mcp` distribution. It is not published yet, so install it from
|
|
276
|
+
source while release ownership is pending:
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
cd dataforge-mcp
|
|
280
|
+
python -m pip install -e ".[dev]"
|
|
281
|
+
dataforge-mcp serve
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
Tools: `dataforge_profile`, `dataforge_detect_errors`,
|
|
285
|
+
`dataforge_verify_fix`, `dataforge_apply_repairs`, and `dataforge_revert`.
|
|
286
|
+
The default transport is stdio. MCP reads and writes are sandboxed to configured
|
|
287
|
+
allowed roots; dry-run works by default, while apply requires `--enable-apply`.
|
|
288
|
+
Streamable HTTP is available for local experiments.
|
|
289
|
+
|
|
290
|
+
The monorepo `packages/` directory contains the side-package release sources
|
|
291
|
+
for `dataforge_07_evals`, `dataforge_07_dbt`, and
|
|
292
|
+
`dataforge_07_agent_patterns`.
|
|
293
|
+
|
|
294
|
+
## Playground And Model Demo
|
|
295
|
+
|
|
296
|
+
- `playground/api/` is the API backend for the CSV playground. Public Space
|
|
297
|
+
deployments use `dataforge-playground`.
|
|
298
|
+
- `playground/web/` is the static browser UI deployed through Cloudflare
|
|
299
|
+
Workers Static Assets. Its primary workflow is `POST /api/analyze`: upload a
|
|
300
|
+
CSV, review categorical risk and pending inferred constraints, inspect
|
|
301
|
+
verified dry-run repairs and non-repairs, then export a receipt with the
|
|
302
|
+
local CLI apply/audit/revert command shape.
|
|
303
|
+
- The current verified public playground URL is
|
|
304
|
+
`https://dataforge.praneshrajan15.workers.dev/playground`, backed by
|
|
305
|
+
`https://Praneshrajan15-dataforge-playground.hf.space`.
|
|
306
|
+
- That Workers URL is the production playground surface for the full original
|
|
307
|
+
vision; this is the release URL.
|
|
308
|
+
- `playground-model/` is a separate Gradio Space demo for the published
|
|
309
|
+
`DataForge-0.5B-SFT` smoke checkpoint. It accepts small CSV snippets and is
|
|
310
|
+
intentionally limited to demo use.
|
|
311
|
+
|
|
312
|
+
The playground does not persist uploaded files, does not use browser storage,
|
|
313
|
+
does not mutate data in the hosted flow, and does not call an LLM unless a
|
|
314
|
+
backend provider key is explicitly configured.
|
|
315
|
+
|
|
316
|
+
## Benchmark Results
|
|
317
|
+
|
|
318
|
+
<!-- BENCH:START -->
|
|
319
|
+
Generated from `eval/results/agent_comparison.json` (schema `dataforge_benchmark_run_v2`, seeds `0, 1, 2`, git `dbd1bed0a03c`, dirty `true`).
|
|
320
|
+
|
|
321
|
+
| Method | Precision | Recall | F1 | Avg Steps | Quota Units | GPU Hours |
|
|
322
|
+
| --- | --- | --- | --- | --- | --- | --- |
|
|
323
|
+
| heuristic | 0.3167 | 0.3025 | 0.2772 | 374.33 | 0.0000 | 0.0000 |
|
|
324
|
+
| random | 0.0038 | 0.0003 | 0.0005 | 150.33 | 0.0000 | 0.0000 |
|
|
325
|
+
|
|
326
|
+
See `BENCHMARK_REPORT.md` for per-dataset tables, error bars, and citation-only SOTA rows.
|
|
327
|
+
|
|
328
|
+
Dataset bytes are pinned to BigDaMa/raha revision `7be1334b8c7bbdac3f47ef514fb3e1e8c5fc181c` for hospital, flights, beers; dirty/clean SHA-256s are recorded in the JSON metadata.
|
|
329
|
+
<!-- BENCH:END -->
|
|
330
|
+
|
|
331
|
+
## Local Setup
|
|
332
|
+
|
|
333
|
+
```bash
|
|
334
|
+
make setup
|
|
335
|
+
make lint
|
|
336
|
+
make type
|
|
337
|
+
make test
|
|
338
|
+
make backend-gate
|
|
339
|
+
make release-gate
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
Verification works on Linux, macOS, and Windows with Git Bash available for GNU
|
|
343
|
+
Make recipes. Python support is `>=3.11,<3.13`.
|
|
344
|
+
|
|
345
|
+
`profile --constraints-out` writes a strict `constraint_review_v1` JSON artifact.
|
|
346
|
+
Every inferred candidate starts as `pending`; repair ignores pending and
|
|
347
|
+
rejected candidates. In v1, only accepted `column_type`, `domain_bound`, and
|
|
348
|
+
`functional_dependency` candidates affect repair. Accepted regex and uniqueness
|
|
349
|
+
candidates remain review evidence until verifier support is added. Use
|
|
350
|
+
`dataforge constraints review constraints.json` for the Textual review UI, or
|
|
351
|
+
use deterministic CI flags such as `--accept cnd-... --no-tui --json`.
|
|
352
|
+
|
|
353
|
+
`make backend-gate` is the release-quality backend check: lint, format, strict
|
|
354
|
+
mypy, root tests, MCP tests, README truth, benchmark truth, OpenAPI snapshot
|
|
355
|
+
drift, secret scan, dependency audit availability, SBOM generation
|
|
356
|
+
availability, and package build availability for both `dataforge_07` and
|
|
357
|
+
`dataforge_07_mcp`. The gate covers the core `dataforge_07` distribution and
|
|
358
|
+
release surfaces; the historical
|
|
359
|
+
`data_quality_env` namespace remains source-tree regression coverage, not part
|
|
360
|
+
of the `dataforge` wheel or source distribution.
|
|
361
|
+
|
|
362
|
+
Before release, run `scripts/ci/backend_gate.py --require-optional` so
|
|
363
|
+
dependency audit, SBOM generation, and package builds are hard failures rather
|
|
364
|
+
than availability checks.
|
|
365
|
+
|
|
366
|
+
Release doctor scopes:
|
|
367
|
+
|
|
368
|
+
```bash
|
|
369
|
+
dataforge release doctor --core --json
|
|
370
|
+
dataforge release doctor --maintainer-deploy --json
|
|
371
|
+
dataforge release gate --json
|
|
372
|
+
dataforge release full-vision --json
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
`--core` is the default OSS release check. `--maintainer-deploy` additionally
|
|
376
|
+
checks maintainer-specific Hugging Face, Kaggle OAuth plus clean-config Kaggle
|
|
377
|
+
CLI execution, and Cloudflare state.
|
|
378
|
+
`release gate` is the authoritative fresh-user proof: it builds the
|
|
379
|
+
distribution, audits wheel contents, creates a dependency wheelhouse, installs
|
|
380
|
+
with `pip --no-index --find-links`, then runs profile, repair dry-run, apply,
|
|
381
|
+
constraint review, audit, revert, and post-revert audit from outside the source
|
|
382
|
+
checkout.
|
|
383
|
+
|
|
384
|
+
Configure pending trusted publishers for `dataforge_07` on TestPyPI and PyPI
|
|
385
|
+
before tagging. The real PyPI workflow refuses pre-release metadata and should
|
|
386
|
+
only run after trusted publishing, attestations, and fresh-install evidence are
|
|
387
|
+
verified. `dataforge release full-vision --json` is expected to fail until PyPI
|
|
388
|
+
publication evidence, dbt-duckdb proof, design-partner evidence,
|
|
389
|
+
and model-family evidence are real.
|
|
390
|
+
|
|
391
|
+
Windows setup:
|
|
392
|
+
|
|
393
|
+
```powershell
|
|
394
|
+
winget install -e --id Python.Python.3.12
|
|
395
|
+
winget install -e --id ezwinports.make
|
|
396
|
+
py -3.12 -m venv .venv
|
|
397
|
+
.\.venv\Scripts\Activate.ps1
|
|
398
|
+
python -m pip install -e ".[all]"
|
|
399
|
+
make lint && make type && make test
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
## Environment Variables
|
|
403
|
+
|
|
404
|
+
Provider keys belong in a root `.env` file, which is gitignored and loaded with
|
|
405
|
+
`python-dotenv` where needed.
|
|
406
|
+
|
|
407
|
+
- `GROQ_API_KEY`
|
|
408
|
+
- `GEMINI_API_KEY`
|
|
409
|
+
- `CEREBRAS_API_KEY`
|
|
410
|
+
- `OPENROUTER_API_KEY`
|
|
411
|
+
- `HF_TOKEN`
|
|
412
|
+
|
|
413
|
+
## When DataForge Is The Wrong Tool
|
|
414
|
+
|
|
415
|
+
Do not use DataForge for streaming data, very large warehouse tables, regulated
|
|
416
|
+
workflows where every fix must be human-authored, strict low-latency SLAs, or
|
|
417
|
+
teams already well served by maintained Great Expectations/dbt suites. DataForge
|
|
418
|
+
is currently best suited to local CSV profiling, repair experiments, benchmark
|
|
419
|
+
runs, and training/evaluation research.
|
|
420
|
+
|
|
421
|
+
## Repository Docs
|
|
422
|
+
|
|
423
|
+
- [.cursor/rules/dataforge.md](.cursor/rules/dataforge.md) - always-applied contribution rules
|
|
424
|
+
- [ARCHITECTURE.md](ARCHITECTURE.md) - current system architecture and dependencies
|
|
425
|
+
- [DECISIONS.md](DECISIONS.md) - technical decision log
|
|
426
|
+
- [CONTRIBUTING.md](CONTRIBUTING.md) - workflow and code standards
|
|
427
|
+
- [CLAUDE.md](CLAUDE.md) - living gotcha log for agent sessions
|
|
428
|
+
- [CURSOR_MASTER.md](CURSOR_MASTER.md) - context and prompt pack
|
|
429
|
+
- [META_CONTEXT.md](META_CONTEXT.md) - project meta-context
|
|
430
|
+
- [FILE_STRUCTURE.md](FILE_STRUCTURE.md) - current and planned directory map
|
|
431
|
+
- [SECURITY.md](SECURITY.md) - vulnerability reporting policy
|
|
432
|
+
- [specs/SPEC_TEMPLATE.md](specs/SPEC_TEMPLATE.md) - template for new module specs
|
|
433
|
+
|
|
434
|
+
## License
|
|
435
|
+
|
|
436
|
+
Apache-2.0. See [LICENSE](LICENSE).
|