dd-parser-cleaner 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dd_parser_cleaner-0.1.0/.gitignore +12 -0
- dd_parser_cleaner-0.1.0/.python-version +1 -0
- dd_parser_cleaner-0.1.0/.vscode/settings.json +13 -0
- dd_parser_cleaner-0.1.0/PKG-INFO +10 -0
- dd_parser_cleaner-0.1.0/README.md +0 -0
- dd_parser_cleaner-0.1.0/config.yaml +22 -0
- dd_parser_cleaner-0.1.0/gemini/stash.md +309 -0
- dd_parser_cleaner-0.1.0/pyproject.toml +25 -0
- dd_parser_cleaner-0.1.0/src/dd_cleaner/__init__.py +0 -0
- dd_parser_cleaner-0.1.0/src/dd_cleaner/cli.py +35 -0
- dd_parser_cleaner-0.1.0/src/dd_cleaner/engine.py +160 -0
- dd_parser_cleaner-0.1.0/src/dd_parser/__init__.py +9 -0
- dd_parser_cleaner-0.1.0/src/dd_parser/cli.py +25 -0
- dd_parser_cleaner-0.1.0/src/dd_parser/core.py +224 -0
- dd_parser_cleaner-0.1.0/src/dd_parser/models.py +21 -0
- dd_parser_cleaner-0.1.0/src/dd_parser/py.typed +0 -0
- dd_parser_cleaner-0.1.0/test_client.py +27 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"python.analysis.extraPaths": [
|
|
3
|
+
"${workspaceFolder}/src"
|
|
4
|
+
],
|
|
5
|
+
"python.analysis.packageIndexDepths": [
|
|
6
|
+
{
|
|
7
|
+
"name": "dd_parser",
|
|
8
|
+
"depth": 5
|
|
9
|
+
}
|
|
10
|
+
],
|
|
11
|
+
"python.analysis.importFormat": "absolute",
|
|
12
|
+
"python.analysis.typeCheckingMode": "basic"
|
|
13
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dd-parser-cleaner
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A private, local LLM-powered data dictionary parser and entity mapper with automated cleaning.
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: ollama>=0.2.0
|
|
7
|
+
Requires-Dist: pandas>=2.2.0
|
|
8
|
+
Requires-Dist: pydantic>=2.6.0
|
|
9
|
+
Requires-Dist: pypdf>=4.1.0
|
|
10
|
+
Requires-Dist: pyyaml>=6.0.1
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# operational_settings
|
|
2
|
+
batch_size: 10
|
|
3
|
+
files:
|
|
4
|
+
- "sba_dd.csv"
|
|
5
|
+
|
|
6
|
+
# llm_settings
|
|
7
|
+
model_name: "llama3.2"
|
|
8
|
+
temperature: 0.0
|
|
9
|
+
system_prompt: "You are a precise data engineering assistant. Respond strictly in JSON."
|
|
10
|
+
csv_target_column_index: 0
|
|
11
|
+
|
|
12
|
+
# =====================================================================
|
|
13
|
+
# Pipeline Ingestion/Execution Directory Targets
|
|
14
|
+
# =====================================================================
|
|
15
|
+
# dd_parser module outputs
|
|
16
|
+
dd_parser_output_dir: "dd_analysis_results" # Sub-dir where the blueprint lands
|
|
17
|
+
output_filename: "sba_analysis_results.csv" # Name of the handshake blueprint
|
|
18
|
+
|
|
19
|
+
# dd_cleaner module outputs
|
|
20
|
+
raw_dataset_file: "sba_loans_raw.csv" # Resolved under <parent of working_dir>/data/ unless an absolute path is given
|
|
21
|
+
dd_cleaner_output_dir: "dd_cleaner_results" # Dedicated sub-dir for cleaned data
|
|
22
|
+
clean_output_filename: "sba_loans_clean.csv" # Final production dataset filename
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
## Session Stash: Unified Project State & KMDS Document Reporting
|
|
2
|
+
|
|
3
|
+
## Project State Summary
|
|
4
|
+
|
|
5
|
+
* Workspace Title: `dd-parser-cleaner`
|
|
6
|
+
* Active Platform Integration: Fully aligned with the `kmds-data-helper` ecosystem. Ingests and processes data dictionary properties inside the `data_dictionary/` workspace, maps target source payloads out of the `data/` workspace, and drops clean, readable Markdown summaries directly into the `documents/` workspace.
|
|
7
|
+
* Pipeline Handshake Status: Fully functional. The inference engine locks down a secure `# DD-PARSER-SIGNATURE` comment header row at the top of the mapping CSV, which the data cleaner validates before execution.
|
|
8
|
+
* Execution Safety: Resolves case-variant header anomalies dynamically at runtime using a lowercase field map, ensuring type-casting rules apply flawlessly to mismatched dataset schemas.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Active Unified Workspace Layout
|
|
13
|
+
|
|
14
|
+
```text
|
|
15
|
+
/home/rajiv/programming/dd_parser/ # Workspace Directory
|
|
16
|
+
├── pyproject.toml               # Distribution and entry point registry
|
|
17
|
+
├── config.yaml                  # Centralized execution parameter file
|
|
18
|
+
└── src/
|
|
19
|
+
    ├── dd_parser/               # LLM Inference and Heuristic Engine
|
|
20
|
+
    │   ├── __init__.py
|
|
21
|
+
    │   ├── cli.py
|
|
22
|
+
    │   ├── core.py              # Generates blueprint matrix + dd_parsing_summary.md
|
|
23
|
+
    │   └── models.py            # Pydantic schema validation contract
|
|
24
|
+
    └── dd_cleaner/              # Case-Insensitive Transformation Engine
|
|
25
|
+
        ├── __init__.py
|
|
26
|
+
        ├── cli.py
|
|
27
|
+
        └── engine.py            # Generates clean data payload + data_cleaning_summary.md
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Core Code Matrix Updates
|
|
33
|
+
|
|
34
|
+
## 1. Upgraded Project Settings (`pyproject.toml`)
|
|
35
|
+
|
|
36
|
+
```toml
|
|
37
|
+
[project]
|
|
38
|
+
name = "dd-parser-cleaner"
|
|
39
|
+
version = "0.1.0"
|
|
40
|
+
description = "A private, local LLM-powered data dictionary parser and entity mapper with automated cleaning."
|
|
41
|
+
readme = "README.md"
|
|
42
|
+
requires-python = ">=3.10"
|
|
43
|
+
dependencies = [
|
|
44
|
+
"pandas>=2.2.0",
|
|
45
|
+
"pydantic>=2.6.0",
|
|
46
|
+
"pypdf>=4.1.0",
|
|
47
|
+
"ollama>=0.2.0",
|
|
48
|
+
"pyyaml>=6.0.1",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[project.scripts]
|
|
52
|
+
classify-entities = "dd_parser.cli:main"
|
|
53
|
+
clean-dataset = "dd_cleaner.cli:main"
|
|
54
|
+
|
|
55
|
+
[build-system]
|
|
56
|
+
requires = ["hatchling"]
|
|
57
|
+
build-backend = "hatchling.build"
|
|
58
|
+
|
|
59
|
+
[tool.hatch.build.targets.wheel]
|
|
60
|
+
packages = ["src/dd_parser", "src/dd_cleaner"]
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## 2. Runtime Execution Configuration (`config.yaml`)
|
|
64
|
+
|
|
65
|
+
```yaml
|
|
66
|
+
# operational_settings
|
|
67
|
+
batch_size: 10
|
|
68
|
+
files:
|
|
69
|
+
- "sba_dd.csv"
|
|
70
|
+
|
|
71
|
+
# llm_settings
|
|
72
|
+
model_name: "llama3.2"
|
|
73
|
+
temperature: 0.0
|
|
74
|
+
system_prompt: "You are a precise data engineering assistant. Respond strictly in JSON."
|
|
75
|
+
csv_target_column_index: 0
|
|
76
|
+
|
|
77
|
+
# =====================================================================
|
|
78
|
+
# Pipeline Ingestion/Execution Directory Targets
|
|
79
|
+
# =====================================================================
|
|
80
|
+
# Document Report Analytics Target Location
|
|
81
|
+
documents_dir: "documents"
|
|
82
|
+
|
|
83
|
+
# dd_parser module outputs
|
|
84
|
+
dd_parser_output_dir: "dd_analysis_results"
|
|
85
|
+
output_filename: "sba_analysis_results.csv"
|
|
86
|
+
|
|
87
|
+
# dd_cleaner module outputs
|
|
88
|
+
raw_dataset_file: "sba_loans_raw.csv"
|
|
89
|
+
dd_cleaner_output_dir: "dd_cleaner_results"
|
|
90
|
+
clean_output_filename: "sba_loans_clean.csv"
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## 3. Reporting Parser Engine (`src/dd_parser/core.py`)
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
importos
|
|
97
|
+
importjson
|
|
98
|
+
importyaml
|
|
99
|
+
importtime
|
|
100
|
+
importlogging
|
|
101
|
+
importpandasas pd
|
|
102
|
+
frompypdfimportPdfReader
|
|
103
|
+
importollama
|
|
104
|
+
fromtypingimportList, Dict, Any
|
|
105
|
+
fromdd_parser.modelsimportAttributeAnalysis, BatchAnalysisResponse
|
|
106
|
+
|
|
107
|
+
logger = logging.getLogger("dd_parser")
|
|
108
|
+
|
|
109
|
+
class LocalEntityClassifier:
    """Classify data-dictionary attributes with a local Ollama model plus name heuristics.

    Call ``set_working_config()`` first, then ``process()``. For each input
    file, ``process()`` writes a signed mapping CSV into the parser output
    directory and a Markdown summary into the documents directory.
    """

    def __init__(self):
        # Absolute directory that relative input/output paths are resolved against.
        self.working_dir: str = ""
        # Parsed YAML configuration; populated by set_working_config().
        self.config: Dict[Any, Any] = {}

    def _resolve_output_dir(self) -> str:
        """Return the absolute parser output directory named by the config."""
        raw_output_dir = self.config.get('dd_parser_output_dir', 'dd_analysis_results')
        if os.path.isabs(raw_output_dir):
            return raw_output_dir
        return os.path.abspath(os.path.join(self.working_dir, raw_output_dir))

    def set_working_config(self, working_dir: str, config_path: str):
        """Load the YAML config, bind the working directory and attach the run log.

        Args:
            working_dir: Existing directory that holds the input files.
            config_path: Path to the YAML configuration file.

        Raises:
            FileNotFoundError: If the config file or the working directory is missing.
        """
        abs_config_path = os.path.abspath(config_path)
        if not os.path.exists(abs_config_path):
            raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")

        with open(abs_config_path, 'r', encoding='utf-8') as f:
            # safe_load() yields None for an empty document; keep config a dict
            # so the later .get() calls do not fail.
            self.config = yaml.safe_load(f) or {}

        if not os.path.isdir(working_dir):
            raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
        self.working_dir = os.path.abspath(working_dir)

        abs_output_dir = self._resolve_output_dir()
        os.makedirs(abs_output_dir, exist_ok=True)

        log_file_path = os.path.join(abs_output_dir, "parser_run.log")
        # Attach the file handler once per log path; a second call would
        # otherwise write every record twice.
        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename == os.path.abspath(log_file_path)
            for handler in logger.handlers
        )
        if not already_attached:
            file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
            file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"))
            logger.addHandler(file_handler)

        logger.info("Context Initialized with Hybrid Processing Configuration.")

    def extract_attributes(self, file_path: str, csv_idx: int = 0) -> List[str]:
        """Return the candidate attribute strings found in a CSV, PDF or Markdown file.

        Args:
            file_path: Input file; the extension selects the reader.
            csv_idx: Zero-based column to read when the file is a CSV.

        Raises:
            ValueError: If the file extension is not supported.
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()
        if ext == '.csv':
            return pd.read_csv(file_path).iloc[:, csv_idx].dropna().astype(str).tolist()
        if ext == '.pdf':
            lines: List[str] = []
            for page in PdfReader(file_path).pages:
                # Guard against a page that yields no text at all.
                page_text = page.extract_text() or ""
                lines.extend(line.strip() for line in page_text.split('\n') if line.strip())
            return lines
        if ext in ('.md', '.markdown'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        raise ValueError(f"Unsupported format: {ext}")

    def analyze_batch(self, attributes: List[str]) -> "List[AttributeAnalysis]":
        """Send one batch of attribute names to the model and return the parsed analysis.

        The request passes the JSON schema of ``BatchAnalysisResponse`` as the
        ``format`` argument and validates the reply against the same model.
        """
        prompt = f"""
Analyze the following data dictionary attributes.

### EXAMPLES OF EXCELLENT PERFORMANCE
Input: ["BorrCity", "BankStreet", "GrossApproval", "SoldSecMrktInd"]
Output Schema Map:
{{
"analysis": [
{{"attribute_name": "BorrCity", "provisional_entity": "Borrower", "is_geographical": true, "related_entity": "Borrower", "provisional_python_type": "str"}},
{{"attribute_name": "BankStreet", "provisional_entity": "Bank", "is_geographical": true, "related_entity": "Bank", "provisional_python_type": "str"}},
{{"attribute_name": "GrossApproval", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "float"}},
{{"attribute_name": "SoldSecMrktInd", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "bool"}}
]
}}

### CURRENT EXECUTION BATCH
Attributes to process: {json.dumps(attributes)}
"""
        response = ollama.chat(
            model=self.config.get('model_name', 'llama3.2'),
            messages=[
                {"role": "system", "content": self.config.get('system_prompt', 'You are a precise data engineering assistant. Respond strictly in JSON.')},
                {"role": "user", "content": prompt}
            ],
            # Honour the configured temperature; 0.0 remains the default.
            options={"temperature": self.config.get('temperature', 0.0)},
            format=BatchAnalysisResponse.model_json_schema()
        )
        return BatchAnalysisResponse(**json.loads(response['message']['content'])).analysis

    def post_process_cleaner(self, analysis_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Apply deterministic naming rules on top of the model's answers.

        Each record is copied, so the input dictionaries are not modified.
        Prefix and keyword rules override the entity, indicator-style names are
        forced to ``bool``, and ``related_entity`` is blanked for every record
        that is not geographical.
        """
        cleaned_records = []
        for item in analysis_list:
            data = item.copy()
            attr = data['attribute_name']

            if attr.startswith('Borr'):
                data['provisional_entity'] = 'Borrower'
                if data['is_geographical']:
                    data['related_entity'] = 'Borrower'
            elif attr.startswith('Bank'):
                data['provisional_entity'] = 'Bank'
                if attr in ['BankStreet', 'BankCity', 'BankState', 'BankZip']:
                    data['is_geographical'] = True
                    data['related_entity'] = 'Bank'
            elif attr.startswith('Project'):
                data['provisional_entity'] = 'Project'
                if attr in ['ProjectCounty', 'ProjectState']:
                    data['is_geographical'] = True
                    data['related_entity'] = 'Project'
            elif 'Approval' in attr or 'Disbursement' in attr or attr in ['Program', 'Subprogram']:
                data['provisional_entity'] = 'Loan'
            elif attr.startswith('SBA'):
                data['provisional_entity'] = 'SBA'
                if data['is_geographical']:
                    data['related_entity'] = 'SBA'

            if attr.endswith('Ind') or 'Indicator' in attr:
                data['provisional_python_type'] = 'bool'

            if not data['is_geographical']:
                data['related_entity'] = ""
            cleaned_records.append(data)
        return cleaned_records

    def generate_parsing_markdown_summary(self, final_results: List[Dict[str, Any]], base_project_dir: str):
        """Write ``dd_parsing_summary.md`` into the documents directory.

        Args:
            final_results: Non-empty list of cleaned analysis records.
            base_project_dir: Directory under which the documents directory
                (config key ``documents_dir``) is created.
        """
        df = pd.DataFrame(final_results)
        doc_dir_name = self.config.get('documents_dir', 'documents')
        abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
        os.makedirs(abs_doc_dir, exist_ok=True)

        report_path = os.path.join(abs_doc_dir, "dd_parsing_summary.md")
        entity_counts = df['provisional_entity'].value_counts()
        total_attributes = len(df)

        categorical_df = df[df['provisional_python_type'] == 'bool']
        numerical_df = df[df['provisional_python_type'].isin(['int', 'float'])]
        semantic_df = df[~df['provisional_python_type'].isin(['bool', 'int', 'float'])]

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# KMDS Data Helper: Data Dictionary Parsing Summary\n\n")
            f.write(f"**Total Tracked Attributes:** {total_attributes}  \n")
            f.write(f"**Unique Detected Entities:** {len(entity_counts)}\n\n")

            f.write("## Entity Size & Distribution Profile\n")
            f.write("| Detected Entity Node | Number of Attributes (Size) |\n")
            f.write("| :--- | :--- |\n")
            for ent, count in entity_counts.items():
                f.write(f"| {ent} | {count} |\n")
            f.write("\n")

            f.write("## Attribute Structural Categories\n")
            for title, subset in (("Categorical Fields", categorical_df), ("Numerical Fields", numerical_df)):
                f.write(f"### {title} (Total: {len(subset)})\n")
                for _, row in subset.iterrows():
                    # The backslash is escaped so the report receives the literal
                    # "$\rightarrow$"; unescaped, "\r" is a carriage return.
                    f.write(f"- `{row['attribute_name']}` ({row['provisional_python_type']}) $\\rightarrow$ Node: **{row['provisional_entity']}**\n")
                f.write("\n")

            f.write(f"### Semantic Attributes Grouped By Parent Class (Total: {len(semantic_df)})\n")
            for ent_group, group_df in semantic_df.groupby('provisional_entity'):
                f.write(f"#### Entity Category: `{ent_group}`\n")
                for _, row in group_df.iterrows():
                    geo_suffix = f" [GEO Linked: {row['related_entity']}]" if row['is_geographical'] else ""
                    f.write(f"  - `{row['attribute_name']}` ({row['provisional_python_type']}){geo_suffix}\n")
                f.write("\n")

    def process(self):
        """Run extraction, batched analysis and reporting for every configured file.

        Missing or unreadable files and failed batches are logged and skipped,
        so one bad input does not abort the rest of the run.
        """
        files_to_process = self.config.get('files', [])
        if not files_to_process:
            return

        abs_output_dir = self._resolve_output_dir()
        batch_size = self.config.get('batch_size', 10)
        csv_col_idx = self.config.get('csv_target_column_index', 0)
        config_filename = self.config.get('output_filename')
        base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))

        for filepath in files_to_process:
            if os.path.isabs(filepath):
                input_file_path = filepath
            else:
                input_file_path = os.path.abspath(os.path.join(self.working_dir, filepath))
            if not os.path.exists(input_file_path):
                logger.warning("Skipping missing input file: %s", input_file_path)
                continue

            filename = os.path.basename(input_file_path)
            try:
                raw_attributes = self.extract_attributes(input_file_path, csv_col_idx)
            except Exception:
                logger.exception("Could not extract attributes from %s; skipping file.", input_file_path)
                continue

            final_results = []
            for i in range(0, len(raw_attributes), batch_size):
                batch = raw_attributes[i:i + batch_size]
                try:
                    batch_output = self.analyze_batch(batch)
                    batch_dicts = [item.model_dump() for item in batch_output]
                    final_results.extend(self.post_process_cleaner(batch_dicts))
                except Exception:
                    # Keep going: the remaining batches may still succeed.
                    logger.exception("Batch starting at attribute %d of %s failed; skipping batch.", i, filename)

            if not final_results:
                logger.warning("No attributes were classified for %s; nothing written.", filename)
                continue

            # The configured name only applies to a single-file run; otherwise
            # derive one per input from its stem.
            if config_filename and len(files_to_process) == 1:
                out_filename = config_filename
            else:
                out_filename = f"mapped_{os.path.splitext(filename)[0]}.csv"
            output_csv_path = os.path.join(abs_output_dir, out_filename)

            # The signature line is written first; the CSV rows are appended after it.
            preamble = f"# DD-PARSER-SIGNATURE: PROCESSED-BY-{self.config.get('model_name', 'llama3.2').upper()}\n"
            with open(output_csv_path, 'w', encoding='utf-8') as f:
                f.write(preamble)

            pd.DataFrame(final_results).to_csv(output_csv_path, mode='a', index=False)
            self.generate_parsing_markdown_summary(final_results, base_project_dir)
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
---
|
|
301
|
+
|
|
302
|
+
## Ready for Next Sprint
|
|
303
|
+
|
|
304
|
+
When you initiate your next tracking session, we will pick up directly with the data cleaner logic to add:
|
|
305
|
+
|
|
306
|
+
* Geographic Scrubbing Routines: Processing columns tagged with `is_geographical: true` to enforce title casing (e.g. `"Colorado Springs"`) and length-padding string masks for postal indices (e.g., zero-padding ZIP codes to a strict length of 5 digits).
|
|
307
|
+
* Missing Value Custom Strategy Options: Designing explicit rules to safely substitute or isolate null cells based on attribute mappings.
|
|
308
|
+
|
|
309
|
+
Let me know whenever you are ready to kick off the geographic data cleaning extensions!
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "dd-parser-cleaner" # This renames the overall workspace project
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A private, local LLM-powered data dictionary parser and entity mapper with automated cleaning."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"pandas>=2.2.0",
|
|
9
|
+
"pydantic>=2.6.0",
|
|
10
|
+
"pypdf>=4.1.0",
|
|
11
|
+
"ollama>=0.2.0",
|
|
12
|
+
"pyyaml>=6.0.1",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
classify-entities = "dd_parser.cli:main" # Reverted back to your original folder string
|
|
17
|
+
clean-dataset = "dd_cleaner.cli:main" # Remains completely stable
|
|
18
|
+
|
|
19
|
+
[build-system]
|
|
20
|
+
requires = ["hatchling"]
|
|
21
|
+
build-backend = "hatchling.build"
|
|
22
|
+
|
|
23
|
+
# --- ADD THIS BLOCK TO RESOLVE THE WHEEL SELECTION BUG ---
|
|
24
|
+
[tool.hatch.build.targets.wheel]
|
|
25
|
+
packages = ["src/dd_parser", "src/dd_cleaner"]
|
|
File without changes
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
def main():
    """Command-line entry point for the data cleaner pipeline.

    Configures console logging, reads ``--working-dir`` and ``--config``,
    then runs the cleaner. Any exception is logged and turned into exit
    status 1.
    """
    logging.basicConfig(
        datefmt="%Y-%m-%d %H:%M:%S",
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        level=logging.INFO,
    )

    arg_parser = argparse.ArgumentParser(
        description="Run the data cleaner pipeline driven by the verified dd-parser blueprint matrix map."
    )
    required_flags = (
        ("--working-dir", "Path to raw data directories containing target payloads."),
        ("--config", "Path to config.yaml file containing script targets."),
    )
    for flag, help_text in required_flags:
        arg_parser.add_argument(flag, required=True, help=help_text)
    cli_args = arg_parser.parse_args()

    # Deferred import: logging is configured before the engine module is loaded.
    from dd_cleaner.engine import DataCleanerEngine

    try:
        engine = DataCleanerEngine()
        engine.set_working_config(working_dir=cli_args.working_dir, config_path=cli_args.config)
        engine.clean_dataset()
    except Exception as exc:
        logging.getLogger("dd_cleaner_main").error(f"š Critical Pipeline Failure: {str(exc)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
import yaml
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from typing import Dict, Any
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger("dd_cleaner")
|
|
8
|
+
|
|
9
|
+
class DataCleanerEngine:
    """Clean a raw dataset using the attribute blueprint produced by dd_parser.

    Call ``set_working_config()`` first, then ``clean_dataset()``.
    """

    def __init__(self):
        # Absolute directory that the parser output directory is resolved against.
        self.working_dir: str = ""
        # Parsed YAML configuration; populated by set_working_config().
        self.config: Dict[Any, Any] = {}

    def set_working_config(self, working_dir: str, config_path: str):
        """Load the shared config and bind the root working directory.

        Raises:
            FileNotFoundError: If the config file or the working directory is missing.
        """
        abs_config_path = os.path.abspath(config_path)
        if not os.path.exists(abs_config_path):
            raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")
        with open(abs_config_path, 'r', encoding='utf-8') as f:
            # safe_load() yields None for an empty document; keep config a dict
            # so the later .get() calls do not fail.
            self.config = yaml.safe_load(f) or {}
        if not os.path.isdir(working_dir):
            raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
        self.working_dir = os.path.abspath(working_dir)
        logger.info("Cleaner Context Initialized.")

    def verify_and_load_blueprint(self) -> pd.DataFrame:
        """Load the parser's mapping CSV after checking its signature line.

        Raises:
            FileNotFoundError: If the blueprint file does not exist.
            ValueError: If the first line is not a ``# DD-PARSER-SIGNATURE`` header.
        """
        parser_dir = self.config.get('dd_parser_output_dir', 'dd_analysis_results')
        blueprint_name = self.config.get('output_filename', 'sba_analysis_results.csv')
        if os.path.isabs(parser_dir):
            blueprint_path = os.path.join(parser_dir, blueprint_name)
        else:
            blueprint_path = os.path.abspath(os.path.join(self.working_dir, parser_dir, blueprint_name))

        if not os.path.exists(blueprint_path):
            raise FileNotFoundError(f"Missing parsing matrix blueprint file at: {blueprint_path}")

        with open(blueprint_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
            if not first_line.startswith("# DD-PARSER-SIGNATURE"):
                raise ValueError(f"Rejected: File at {blueprint_path} does not originate from dd-parser pipeline!")

        # Skip exactly the signature line. Treating '#' as a comment character
        # would also cut off any data field that contains '#'.
        return pd.read_csv(blueprint_path, skiprows=1)

    def generate_cleaning_markdown_summary(self, data_df: pd.DataFrame, base_project_dir: str):
        """Write ``data_cleaning_summary.md`` with dtype counts and per-column null metrics.

        Args:
            data_df: Dataset to profile.
            base_project_dir: Directory under which the documents directory
                (config key ``documents_dir``) is created.
        """
        doc_dir_name = self.config.get('documents_dir', 'documents')
        abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
        os.makedirs(abs_doc_dir, exist_ok=True)

        report_path = os.path.join(abs_doc_dir, "data_cleaning_summary.md")
        type_summary = data_df.dtypes.value_counts()
        null_counts = data_df.isnull().sum()
        total_rows = len(data_df)

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# KMDS Data Helper: Data Cleaning Summary Report\n\n")
            f.write("## Converted Data Types Summary\n")
            # Markdown tables need a header and a separator row to render.
            f.write("| Data Type | Number of Columns |\n")
            f.write("| :--- | :--- |\n")
            for dtype_name, count in type_summary.items():
                f.write(f"| {dtype_name} | {count} |\n")
            f.write("\n## Missing Value Counts\n")
            f.write("| Column | Missing Values | Populated |\n")
            f.write("| :--- | :--- | :--- |\n")
            for col in data_df.columns:
                missing = null_counts[col]
                # An empty dataset has no meaningful ratio; report 0.00%.
                populated_pct = (total_rows - missing) / total_rows * 100 if total_rows else 0.0
                f.write(f"| `{col}` | {missing} | {populated_pct:.2f}% |\n")

        logger.info(f"Generated clean dataset validation profile saved to: {report_path}")

    def execute_numeric_imputer(self, series: pd.Series) -> pd.Series:
        """Return ``series`` with missing values replaced by its median.

        Kept as a separate method so the imputation strategy can be swapped
        without touching the callers.
        """
        fill_value = series.median()
        return series.fillna(fill_value)

    def execute_poc_feature_prep(self, df: pd.DataFrame, blueprint_df: pd.DataFrame) -> pd.DataFrame:
        """Build per-entity geo search strings and fill missing values by declared type.

        Works on a copy of ``df``. Blueprint attribute names are matched to
        dataset columns case-insensitively; attributes with no matching column
        are ignored.
        """
        prep_df = df.copy()
        raw_cols_lower = {col.lower(): col for col in prep_df.columns}

        # 1. One "<entity>_geo_search_string" column per entity that has
        #    geographical attributes present in the dataset.
        geo_blueprint = blueprint_df[blueprint_df['is_geographical'] == True]  # noqa: E712
        for entity_name, group in geo_blueprint.groupby('provisional_entity'):
            geo_cols = [
                raw_cols_lower[str(attr).lower()]
                for attr in group['attribute_name']
                if str(attr).lower() in raw_cols_lower
            ]
            if geo_cols:
                logger.info(f" -> Consolidating geo attributes for entity: '{entity_name}'")
                prep_df[f"{entity_name.lower()}_geo_search_string"] = prep_df[geo_cols].fillna("").astype(str).agg(", ".join, axis=1)

        # 2. Missing-value strategy chosen by the declared Python type.
        for _, row in blueprint_df.iterrows():
            attr_lower = str(row['attribute_name']).lower()
            if attr_lower not in raw_cols_lower:
                continue
            col_name = raw_cols_lower[attr_lower]
            t_type = row['provisional_python_type']

            if t_type in ['int', 'float']:
                if prep_df[col_name].isnull().any():
                    prep_df[col_name] = self.execute_numeric_imputer(prep_df[col_name])
            elif t_type == 'str':
                # Placeholder strings and real nulls both become "MISSING".
                prep_df[col_name] = prep_df[col_name].replace(["nan", "None", ""], "MISSING").fillna("MISSING")

        return prep_df

    def clean_dataset(self):
        """Run the full cleaning pipeline.

        Stage 1 casts each mapped column to its declared type, stage 2 writes
        the clean CSV and the Markdown report, and stage 3 writes
        ``feature_selection_ready.csv`` after missing-value handling.
        """
        blueprint_df = self.verify_and_load_blueprint()
        raw_file = self.config.get('raw_dataset_file', 'sba_loans_raw.csv')
        base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))
        if os.path.isabs(raw_file):
            raw_path = raw_file
        else:
            raw_path = os.path.abspath(os.path.join(base_project_dir, "data", raw_file))

        data_df = pd.read_csv(raw_path)
        raw_columns_lower = {col.lower(): col for col in data_df.columns}

        # [STAGE 1] Perform baseline type conversions
        for _, row in blueprint_df.iterrows():
            target_type = row['provisional_python_type']
            attr_lower = str(row['attribute_name']).lower()
            if attr_lower not in raw_columns_lower:
                continue
            col_name = raw_columns_lower[attr_lower]

            try:
                if target_type == 'bool':
                    if data_df[col_name].dtype == object:
                        data_df[col_name] = data_df[col_name].astype(str).str.upper().str.strip().isin(['TRUE', '1', 'Y', 'YES', 'T'])
                    else:
                        data_df[col_name] = data_df[col_name].fillna(False).astype(bool)
                elif target_type in ['int', 'float']:
                    data_df[col_name] = pd.to_numeric(data_df[col_name], errors='coerce')
                elif target_type in ['datetime.date', 'datetime.datetime']:
                    data_df[col_name] = pd.to_datetime(data_df[col_name], errors='coerce')
                else:
                    # Keep real nulls as nulls. astype(str) alone would turn
                    # them into the text "nan", hiding them from the
                    # missing-value report and the geo strings.
                    column = data_df[col_name]
                    data_df[col_name] = column.astype(str).str.strip().mask(column.isna())
            except Exception:
                # Best effort: leave the column as read, but record the failure.
                logger.warning("Could not convert column '%s' to %s; left unchanged.", col_name, target_type, exc_info=True)

        # [STAGE 2] Write standard output and generate report
        cleaner_dir = self.config.get('dd_cleaner_output_dir', 'dd_cleaner_results')
        clean_filename = self.config.get('clean_output_filename', 'sba_loans_clean.csv')
        if os.path.isabs(cleaner_dir):
            abs_dest_dir = cleaner_dir
        else:
            abs_dest_dir = os.path.abspath(os.path.join(base_project_dir, "data", cleaner_dir))
        os.makedirs(abs_dest_dir, exist_ok=True)

        data_df.to_csv(os.path.join(abs_dest_dir, clean_filename), index=False)
        self.generate_cleaning_markdown_summary(data_df, base_project_dir)

        # [STAGE 3] Run PoC feature prep AFTER generating the report, so the
        # report reflects the data before imputation.
        logger.info("Executing PoC Missing Value Strategies and Geo-string preparation...")
        poc_ready_df = self.execute_poc_feature_prep(data_df, blueprint_df)

        poc_output_path = os.path.join(abs_dest_dir, "feature_selection_ready.csv")
        poc_ready_df.to_csv(poc_output_path, index=False)
        logger.info(f"Hand-off Complete! Modeling matrix saved to: {poc_output_path}")
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
def main():
    """Command-line entry point for the data dictionary parser.

    Configures console logging, reads ``--working-dir`` and ``--config``,
    then runs the classifier over the configured files.
    """
    logging.basicConfig(
        datefmt="%Y-%m-%d %H:%M:%S",
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        level=logging.INFO,
    )

    arg_parser = argparse.ArgumentParser(description="Run the private local LLM Data Dictionary Parser.")
    required_flags = (
        ("--working-dir", "Path to raw source files."),
        ("--config", "Path to config.yaml file."),
    )
    for flag, help_text in required_flags:
        arg_parser.add_argument(flag, required=True, help=help_text)
    cli_args = arg_parser.parse_args()

    # Deferred import: logging is configured before the core module is loaded.
    from dd_parser.core import LocalEntityClassifier

    runner = LocalEntityClassifier()
    runner.set_working_config(working_dir=cli_args.working_dir, config_path=cli_args.config)
    runner.process()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import yaml
|
|
4
|
+
import time
|
|
5
|
+
import logging
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from pypdf import PdfReader
|
|
8
|
+
import ollama
|
|
9
|
+
from typing import List, Dict, Any
|
|
10
|
+
from dd_parser.models import AttributeAnalysis, BatchAnalysisResponse
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger("dd_parser")


class LocalEntityClassifier:
    """Maps data dictionary attributes to business entities with a local LLM.

    Typical use: call ``set_working_config`` once to load the YAML config and
    attach the run log, then ``process`` to extract attribute names from each
    configured file, classify them in batches through ``ollama.chat``, apply
    the rule-based corrections in ``post_process_cleaner`` and write a CSV
    plus a Markdown summary.
    """

    _LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    _LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"

    # Attribute names that are always treated as geographical for their entity.
    _BANK_GEO_ATTRS = ('BankStreet', 'BankCity', 'BankState', 'BankZip')
    _PROJECT_GEO_ATTRS = ('ProjectCounty', 'ProjectState')

    def __init__(self):
        self.working_dir: str = ""
        self.config: Dict[Any, Any] = {}

    def _resolve_output_dir(self) -> str:
        """Return the output directory: absolute as configured, else under ``working_dir``."""
        raw_output_dir = self.config.get('dd_parser_output_dir', 'dd_analysis_results')
        if os.path.isabs(raw_output_dir):
            return raw_output_dir
        return os.path.abspath(os.path.join(self.working_dir, raw_output_dir))

    def set_working_config(self, working_dir: str, config_path: str):
        """Load the YAML configuration and prepare the output directory and run log.

        Args:
            working_dir: Directory holding the raw source files; relative
                input and output paths are resolved against it.
            config_path: Path to the YAML configuration file.

        Raises:
            FileNotFoundError: If the config file or ``working_dir`` is missing.
            ValueError: If the config file does not contain a YAML mapping.
        """
        abs_config_path = os.path.abspath(config_path)
        if not os.path.exists(abs_config_path):
            raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")

        with open(abs_config_path, 'r', encoding='utf-8') as f:
            # safe_load yields None for an empty document; normalise to {} so
            # the .get() lookups below keep working with their defaults.
            loaded = yaml.safe_load(f) or {}
        if not isinstance(loaded, dict):
            raise ValueError(f"Configuration file must contain a YAML mapping: {abs_config_path}")
        self.config = loaded

        if not os.path.isdir(working_dir):
            raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
        self.working_dir = os.path.abspath(working_dir)

        abs_output_dir = self._resolve_output_dir()
        os.makedirs(abs_output_dir, exist_ok=True)

        log_file_path = os.path.join(abs_output_dir, "parser_run.log")
        # The logger is module-level, so calling this method again would
        # otherwise attach a second handler and duplicate every log line.
        target_log = os.path.abspath(log_file_path)
        already_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == target_log
            for handler in logger.handlers
        )
        if not already_attached:
            file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
            file_handler.setFormatter(logging.Formatter(self._LOG_FORMAT, datefmt=self._LOG_DATEFMT))
            logger.addHandler(file_handler)

        logger.info("Context Initialized with Hybrid Processing Configuration.")
        logger.info(f"Loaded Config: {abs_config_path} | Tracking Log: {log_file_path}")

    def extract_attributes(self, file_path: str, csv_idx: int = 0) -> List[str]:
        """Read candidate attribute strings from a CSV, PDF or Markdown file.

        Args:
            file_path: File to read; the (case-insensitive) extension selects
                the reader.
            csv_idx: Positional index of the CSV column to read.

        Returns:
            For CSV, the non-null values of the chosen column as strings.
            For PDF and Markdown, every non-blank line, stripped.

        Raises:
            ValueError: If the extension is not .csv, .pdf, .md or .markdown.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == '.csv':
            column = pd.read_csv(file_path).iloc[:, csv_idx]
            return column.dropna().astype(str).tolist()
        if ext == '.pdf':
            lines: List[str] = []
            for page in PdfReader(file_path).pages:
                # Guard against a page that yields no text at all.
                text = page.extract_text() or ""
                lines.extend(part.strip() for part in text.split('\n') if part.strip())
            return lines
        if ext in ('.md', '.markdown'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        raise ValueError(f"Unsupported format: {ext}")

    def analyze_batch(self, attributes: List[str]) -> "List[AttributeAnalysis]":
        """Classify one batch of attribute names through ``ollama.chat``.

        The request carries a few-shot prompt and the JSON schema of
        ``BatchAnalysisResponse`` as the ``format`` argument; the reply content
        is parsed as JSON and validated against that model.

        Args:
            attributes: Attribute names to classify in a single request.

        Returns:
            The ``analysis`` list of the validated response.
        """
        prompt = f"""
        Analyze the following data dictionary attributes.

        ### EXAMPLES OF EXCELLENT PERFORMANCE
        Input: ["BorrCity", "BankStreet", "GrossApproval", "SoldSecMrktInd"]
        Output Schema Map:
        {{
        "analysis": [
        {{"attribute_name": "BorrCity", "provisional_entity": "Borrower", "is_geographical": true, "related_entity": "Borrower", "provisional_python_type": "str"}},
        {{"attribute_name": "BankStreet", "provisional_entity": "Bank", "is_geographical": true, "related_entity": "Bank", "provisional_python_type": "str"}},
        {{"attribute_name": "GrossApproval", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "float"}},
        {{"attribute_name": "SoldSecMrktInd", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "bool"}}
        ]
        }}

        ### CURRENT EXECUTION BATCH
        Attributes to process: {json.dumps(attributes)}
        """

        response = ollama.chat(
            model=self.config.get('model_name', 'llama3.2'),
            messages=[
                {"role": "system", "content": self.config.get('system_prompt', 'You are a precise data engineering assistant. Respond strictly in JSON.')},
                {"role": "user", "content": prompt},
            ],
            options={"temperature": self.config.get('temperature', 0.0)},
            format=BatchAnalysisResponse.model_json_schema(),
        )
        payload = json.loads(response['message']['content'])
        # model_validate reports a non-mapping payload as a validation error
        # instead of failing on the ** unpacking.
        return BatchAnalysisResponse.model_validate(payload).analysis

    def post_process_cleaner(self, analysis_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Apply deterministic naming rules on top of the model's classification.

        Each record is shallow-copied, so the input dicts are not modified.
        Prefix rules ('Borr', 'Bank', 'Project', 'SBA') and the loan keywords
        override ``provisional_entity``; names ending in 'Ind' or containing
        'Indicator' are forced to ``bool``; non-geographical records get an
        empty-string ``related_entity``.

        Args:
            analysis_list: Records with the keys of ``AttributeAnalysis``.

        Returns:
            A new list of corrected records, in input order.
        """
        cleaned_records = []
        for item in analysis_list:
            data = item.copy()
            attr = data['attribute_name']

            if attr.startswith('Borr'):
                data['provisional_entity'] = 'Borrower'
                if data['is_geographical']:
                    data['related_entity'] = 'Borrower'
            elif attr.startswith('Bank'):
                data['provisional_entity'] = 'Bank'
                if attr in self._BANK_GEO_ATTRS:
                    data['is_geographical'] = True
                    data['related_entity'] = 'Bank'
            elif attr.startswith('Project'):
                data['provisional_entity'] = 'Project'
                if attr in self._PROJECT_GEO_ATTRS:
                    data['is_geographical'] = True
                    data['related_entity'] = 'Project'
            elif 'Approval' in attr or 'Disbursement' in attr or attr in ('Program', 'Subprogram'):
                data['provisional_entity'] = 'Loan'
            elif attr.startswith('SBA'):
                data['provisional_entity'] = 'SBA'
                if data['is_geographical']:
                    data['related_entity'] = 'SBA'

            if attr.endswith('Ind') or 'Indicator' in attr:
                data['provisional_python_type'] = 'bool'

            # A related entity is only meaningful for geographical attributes.
            if not data['is_geographical']:
                data['related_entity'] = ""

            cleaned_records.append(data)
        return cleaned_records

    def generate_parsing_markdown_summary(self, final_results: List[Dict[str, Any]], base_project_dir: str):
        """Write ``dd_parsing_summary.md`` describing the classified attributes.

        The report lists attribute counts per entity, then the attributes
        split into bool ("categorical"), int/float ("numerical") and all other
        types ("semantic", grouped by entity). It is written to the configured
        ``documents_dir`` (default ``documents``) under ``base_project_dir``
        and overwrites any existing report.

        Args:
            final_results: Cleaned records as produced by ``post_process_cleaner``.
            base_project_dir: Directory under which the documents folder lives.
        """
        df = pd.DataFrame(final_results)
        doc_dir_name = self.config.get('documents_dir', 'documents')
        abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
        os.makedirs(abs_doc_dir, exist_ok=True)

        report_path = os.path.join(abs_doc_dir, "dd_parsing_summary.md")
        entity_counts = df['provisional_entity'].value_counts()
        total_attributes = len(df)

        type_column = df['provisional_python_type']
        categorical_df = df[type_column == 'bool']
        numerical_df = df[type_column.isin(['int', 'float'])]
        semantic_df = df[~type_column.isin(['bool', 'int', 'float'])]

        # NOTE(review): the glyphs leading the headings below look mis-encoded
        # (probably emoji damaged in transit) - confirm against the upstream file.
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# š KMDS Data Helper: Data Dictionary Parsing Summary\n\n")
            # Two trailing spaces form a Markdown hard line break, keeping the
            # two metrics on separate rendered lines.
            f.write(f"**Total Tracked Attributes:** {total_attributes}  \n")
            f.write(f"**Unique Detected Entities:** {len(entity_counts)}\n\n")

            f.write("## šļø Entity Size & Distribution Profile\n")
            f.write("| Detected Entity Node | Number of Attributes (Size) |\n")
            f.write("| :--- | :--- |\n")
            for ent, count in entity_counts.items():
                f.write(f"| {ent} | {count} |\n")
            f.write("\n")

            f.write("## šļø Attribute Structural Categories\n")

            # The backslash is doubled so the report contains the literal
            # "$\rightarrow$"; a single backslash would emit a carriage return.
            f.write(f"### š Categorical Fields (Total: {len(categorical_df)})\n")
            for _, row in categorical_df.iterrows():
                f.write(f"- `{row['attribute_name']}` ({row['provisional_python_type']}) $\\rightarrow$ Node: **{row['provisional_entity']}**\n")
            f.write("\n")

            f.write(f"### š¢ Numerical Fields (Total: {len(numerical_df)})\n")
            for _, row in numerical_df.iterrows():
                f.write(f"- `{row['attribute_name']}` ({row['provisional_python_type']}) $\\rightarrow$ Node: **{row['provisional_entity']}**\n")
            f.write("\n")

            f.write(f"### š§ Semantic Attributes Grouped By Parent Class (Total: {len(semantic_df)})\n")
            for ent_group, group_df in semantic_df.groupby('provisional_entity'):
                f.write(f"#### Entity Category: `{ent_group}`\n")
                for _, row in group_df.iterrows():
                    geo_suffix = f" [GEO Linked: {row['related_entity']}]" if row['is_geographical'] else ""
                    f.write(f" - `{row['attribute_name']}` ({row['provisional_python_type']}){geo_suffix}\n")
                f.write("\n")

        logger.info(f"Generated parser metadata documentation report saved to: {report_path}")

    def process(self):
        """Run extraction, batched classification and reporting for every configured file.

        For each entry in the config's ``files`` list: extract attribute
        strings, classify them ``batch_size`` at a time, clean the results,
        write a CSV (prefixed with a signature comment line) to the output
        directory and regenerate the Markdown summary. A file that cannot be
        read, or a batch that fails, is logged and skipped so the remaining
        work continues.

        Raises:
            ValueError: If the configured ``batch_size`` is not a positive integer.
        """
        files_to_process = self.config.get('files', [])
        if not files_to_process:
            logger.warning("No input files listed under 'files' in the configuration; nothing to process.")
            return

        abs_output_dir = self._resolve_output_dir()
        # Normally created by set_working_config; repeated here so the CSV
        # write cannot fail when the config was assigned directly.
        os.makedirs(abs_output_dir, exist_ok=True)

        batch_size = self.config.get('batch_size', 10)
        csv_col_idx = self.config.get('csv_target_column_index', 0)
        if not isinstance(batch_size, int) or batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got: {batch_size!r}")
        config_filename = self.config.get('output_filename')

        # Extract base project root for kmds-data-helper structure alignment
        base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))

        for filepath in files_to_process:
            if os.path.isabs(filepath):
                input_file_path = filepath
            else:
                input_file_path = os.path.abspath(os.path.join(self.working_dir, filepath))
            if not os.path.exists(input_file_path):
                logger.warning(f"Skipping missing input file: {input_file_path}")
                continue

            filename = os.path.basename(input_file_path)
            try:
                raw_attributes = self.extract_attributes(input_file_path, csv_col_idx)
                logger.info(f"Extracted {len(raw_attributes)} attributes from {filename}")
            except Exception as e:
                logger.exception(f"Read failure on {filename}: {e}")
                continue

            final_results = []
            for i in range(0, len(raw_attributes), batch_size):
                batch = raw_attributes[i:i + batch_size]
                start_time = time.perf_counter()
                try:
                    batch_output = self.analyze_batch(batch)
                    batch_dicts = [item.model_dump() for item in batch_output]
                    final_results.extend(self.post_process_cleaner(batch_dicts))
                    logger.info(f" Batch run {i} verified in {time.perf_counter() - start_time:.2f}s")
                except Exception as e:
                    # Best effort: keep the traceback, move on to the next batch.
                    logger.exception(f" Batch crash at element index {i}: {e}")

            if not final_results:
                continue

            # A fixed output name is only honoured for a single input file;
            # otherwise several inputs would overwrite the same CSV.
            if config_filename and len(files_to_process) == 1:
                out_filename = config_filename
            else:
                base_name, _ = os.path.splitext(filename)
                out_filename = f"mapped_{base_name}.csv"

            output_csv_path = os.path.join(abs_output_dir, out_filename)

            preamble = f"# DD-PARSER-SIGNATURE: PROCESSED-BY-{self.config.get('model_name', 'llama3.2').upper()}\n"
            with open(output_csv_path, 'w', encoding='utf-8') as f:
                f.write(preamble)

            pd.DataFrame(final_results).to_csv(output_csv_path, mode='a', index=False)
            logger.info(f"Finished tracking {filename}. Saved verified matrix map to: {output_csv_path}")

            # Dynamic Markdown Generation Task Trigger
            self.generate_parsing_markdown_summary(final_results, base_project_dir)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
from typing import List, Optional, Literal
|
|
3
|
+
|
|
4
|
+
class AttributeAnalysis(BaseModel):
    # One classified data-dictionary attribute, as returned by the model.
    # Documented with comments rather than a class docstring so the generated
    # JSON schema stays exactly as before (NOTE(review): pydantic is understood
    # to copy a model docstring into the schema description - confirm).

    # Attribute name exactly as it appeared in the source dictionary.
    attribute_name: str

    # Required: owning business entity.
    provisional_entity: str = Field(
        ...,
        description="The primary business entity this attribute belongs to (e.g., Customer, Product, Transaction).",
    )

    # Required: whether the attribute denotes a location.
    is_geographical: bool = Field(
        ...,
        description="True if the attribute represents a physical location, address, coordinate, country, or region.",
    )

    # Optional: entity the location belongs to; defaults to None.
    related_entity: Optional[str] = Field(
        default=None,
        description="If geographical, which entity does this location bind to? (e.g., 'Customer' for 'shipping_state').",
    )

    # Required: one of the six supported Python type names.
    provisional_python_type: Literal[
        "str",
        "int",
        "float",
        "datetime.date",
        "datetime.datetime",
        "bool",
    ] = Field(
        ...,
        description="Semantic Python data type based on semantics.",
    )
|
|
19
|
+
|
|
20
|
+
class BatchAnalysisResponse(BaseModel):
    # Envelope for one batch reply: a single "analysis" key holding one
    # AttributeAnalysis entry per classified attribute.
    analysis: List[AttributeAnalysis]
|
|
File without changes
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dd_parser import LocalEntityClassifier
|
|
3
|
+
|
|
4
|
+
def run_local_test(target_data_dir=None, config_name="config.yaml"):
    """Run the parser end to end against a local data dictionary folder.

    Progress and any failure are printed; exceptions from the pipeline are
    caught and reported rather than propagated.

    Args:
        target_data_dir: Folder containing the dictionary files. When omitted,
            the ``DD_PARSER_TEST_DATA_DIR`` environment variable is used if
            set, otherwise the original developer path.
        config_name: Path to the config file; the default is resolved
            relative to the current working directory.
    """
    if target_data_dir is None:
        # Folder containing your dictionary files
        target_data_dir = os.environ.get(
            "DD_PARSER_TEST_DATA_DIR",
            "/home/rajiv/programming/kmds_descriptive_analytics/kmds_sba_loans/data_dictionary",
        )

    print("=== Starting dd_parser Integration Test ===")

    try:
        classifier = LocalEntityClassifier()

        # config_path is passed by keyword to match set_working_config's signature
        classifier.set_working_config(working_dir=target_data_dir, config_path=config_name)

        print("\nš Dispatched extraction and Ollama micro-batching pipelines...")
        classifier.process()
        print("\n=== Test Finished Successfully ===")

    except Exception as e:
        print(f"\nā Pipeline execution failed with exception: {e}")


if __name__ == "__main__":
    run_local_test()
|