dd-parser-cleaner 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ .venv/
2
+ uv.lock
3
+ __pycache__/
4
+ *.pyc
5
+ .pytest_cache/
6
+ .DS_Store
7
+ dist/
8
+ dd_analysis_results/
9
+ dd_cleaner_results/
10
+ documents/
11
+ *.csv
12
+ parser_run.log
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,13 @@
1
+ {
2
+ "python.analysis.extraPaths": [
3
+ "${workspaceFolder}/src"
4
+ ],
5
+ "python.analysis.packageIndexDepths": [
6
+ {
7
+ "name": "dd_parser",
8
+ "depth": 5
9
+ }
10
+ ],
11
+ "python.analysis.importFormat": "absolute",
12
+ "python.analysis.typeCheckingMode": "basic"
13
+ }
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: dd-parser-cleaner
3
+ Version: 0.1.0
4
+ Summary: A private, local LLM-powered data dictionary parser and entity mapper with automated cleaning.
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: ollama>=0.2.0
7
+ Requires-Dist: pandas>=2.2.0
8
+ Requires-Dist: pydantic>=2.6.0
9
+ Requires-Dist: pypdf>=4.1.0
10
+ Requires-Dist: pyyaml>=6.0.1
File without changes
@@ -0,0 +1,22 @@
1
+ # operational_settings
2
+ batch_size: 10
3
+ files:
4
+ - "sba_dd.csv"
5
+
6
+ # llm_settings
7
+ model_name: "llama3.2"
8
+ temperature: 0.0
9
+ system_prompt: "You are a precise data engineering assistant. Respond strictly in JSON."
10
+ csv_target_column_index: 0
11
+
12
+ # =====================================================================
13
+ # Pipeline Ingestion/Execution Directory Targets
14
+ # =====================================================================
15
+ # dd_parser module outputs
16
+ dd_parser_output_dir: "dd_analysis_results" # Sub-dir where the blueprint lands
17
+ output_filename: "sba_analysis_results.csv" # Name of the handshake blueprint
18
+
19
+ # dd_cleaner module outputs
20
+ raw_dataset_file: "sba_loans_raw.csv" # Relative names are read from <working_dir>/../data/; absolute paths are used as-is
21
+ dd_cleaner_output_dir: "dd_cleaner_results" # Dedicated sub-dir for cleaned data
22
+ clean_output_filename: "sba_loans_clean.csv" # Final production dataset filename
@@ -0,0 +1,309 @@
1
+ ## 📑 Session Stash: Unified Project State & KMDS Document Reporting
2
+
3
+ ## 📌 Project State Summary
4
+
5
+ * Workspace Title: `dd-parser-cleaner`
6
+ * Active Platform Integration: Fully aligned with the `kmds-data-helper` ecosystem [15]. Ingests and processes data dictionary properties inside the `data_dictionary/` workspace [15], maps target source payloads out of the `data/` workspace [15], and drops clean, readable Markdown summaries directly into the `documents/` workspace [15].
7
+ * Pipeline Handshake Status: Fully functional. The inference engine locks down a secure `# DD-PARSER-SIGNATURE` comment header row at the top of the mapping CSV, which the data cleaner validates before execution.
8
+ * Execution Safety: Resolves case-variant header anomalies dynamically at runtime using a lowercase field map, ensuring type-casting rules apply flawlessly to mismatched dataset schemas.
9
+
10
+ ---
11
+
12
+ ## 📂 Active Unified Workspace Layout
13
+
14
+ ```text
15
+ /home/rajiv/programming/dd_parser/    # Workspace Directory
16
+ ├── pyproject.toml                    # Distribution and entry point registry
17
+ ├── config.yaml                       # Centralized execution parameter file
18
+ └── src/
19
+     ├── dd_parser/                    # LLM Inference and Heuristic Engine
20
+     │   ├── __init__.py
21
+     │   ├── cli.py
22
+     │   ├── core.py                   # Generates blueprint matrix + dd_parsing_summary.md
23
+     │   └── models.py                 # Pydantic schema validation contract
24
+     └── dd_cleaner/                   # Case-Insensitive Transformation Engine
25
+         ├── __init__.py
26
+         ├── cli.py
27
+         └── engine.py                 # Generates clean data payload + data_cleaning_summary.md
28
+ ```
29
+
30
+ ---
31
+
32
+ ## 📄 Core Code Matrix Updates
33
+
34
+ ## 1. Upgraded Project Settings (`pyproject.toml`)
35
+
36
+ ```toml
37
+ [project]
38
+ name = "dd-parser-cleaner"
39
+ version = "0.1.0"
40
+ description = "A private, local LLM-powered data dictionary parser and entity mapper with automated cleaning."
41
+ readme = "README.md"
42
+ requires-python = ">=3.10"
43
+ dependencies = [
44
+ "pandas>=2.2.0",
45
+ "pydantic>=2.6.0",
46
+ "pypdf>=4.1.0",
47
+ "ollama>=0.2.0",
48
+ "pyyaml>=6.0.1",
49
+ ]
50
+
51
+ [project.scripts]
52
+ classify-entities = "dd_parser.cli:main"
53
+ clean-dataset = "dd_cleaner.cli:main"
54
+
55
+ [build-system]
56
+ requires = ["hatchling"]
57
+ build-backend = "hatchling.build"
58
+
59
+ [tool.hatch.build.targets.wheel]
60
+ packages = ["src/dd_parser", "src/dd_cleaner"]
61
+ ```
62
+
63
+ ## 2. Runtime Execution Configuration (`config.yaml`)
64
+
65
+ ```yaml
66
+ # operational_settings
67
+ batch_size: 10
68
+ files:
69
+ - "sba_dd.csv"
70
+
71
+ # llm_settings
72
+ model_name: "llama3.2"
73
+ temperature: 0.0
74
+ system_prompt: "You are a precise data engineering assistant. Respond strictly in JSON."
75
+ csv_target_column_index: 0
76
+
77
+ # =====================================================================
78
+ # Pipeline Ingestion/Execution Directory Targets
79
+ # =====================================================================
80
+ # Document Report Analytics Target Location
81
+ documents_dir: "documents"
82
+
83
+ # dd_parser module outputs
84
+ dd_parser_output_dir: "dd_analysis_results"
85
+ output_filename: "sba_analysis_results.csv"
86
+
87
+ # dd_cleaner module outputs
88
+ raw_dataset_file: "sba_loans_raw.csv"
89
+ dd_cleaner_output_dir: "dd_cleaner_results"
90
+ clean_output_filename: "sba_loans_clean.csv"
91
+ ```
92
+
93
+ ## 3. Reporting Parser Engine (`src/dd_parser/core.py`)
94
+
95
+ ```python
96
+ import os
97
+ import json
98
+ import yaml
99
+ import time
100
+ import logging
101
+ import pandas as pd
102
+ from pypdf import PdfReader
103
+ import ollama
104
+ from typing import List, Dict, Any
105
+ from dd_parser.models import AttributeAnalysis, BatchAnalysisResponse
106
+
107
+ logger = logging.getLogger("dd_parser")
108
+
109
class LocalEntityClassifier:
    """Classify data-dictionary attributes with a local Ollama model.

    Workflow: ``set_working_config`` loads the YAML config and binds the
    working directory, then ``process`` reads each configured file, sends the
    attribute names to the model in batches, applies deterministic prefix
    heuristics on top of the model output, and writes a signed CSV blueprint
    plus a Markdown summary.
    """

    # Columns the Markdown summary reads from every result record.
    _RESULT_COLUMNS = [
        'attribute_name',
        'provisional_entity',
        'is_geographical',
        'related_entity',
        'provisional_python_type',
    ]

    def __init__(self):
        self.working_dir: str = ""
        self.config: Dict[Any, Any] = {}

    @staticmethod
    def _resolve_path(path: str, base_dir: str) -> str:
        """Return ``path`` unchanged if absolute, else resolve it under ``base_dir``."""
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base_dir, path))

    def _output_dir(self) -> str:
        """Absolute directory that receives the blueprint CSV and the run log."""
        raw_output_dir = self.config.get('dd_parser_output_dir', 'dd_analysis_results')
        return self._resolve_path(raw_output_dir, self.working_dir)

    def set_working_config(self, working_dir: str, config_path: str):
        """Load the YAML config, bind the working directory and attach the run log.

        Args:
            working_dir: Existing directory that relative input/output paths
                are resolved against.
            config_path: Path to the YAML configuration file.

        Raises:
            FileNotFoundError: If the config file or working directory is missing.
            ValueError: If the config file does not contain a YAML mapping.
        """
        abs_config_path = os.path.abspath(config_path)
        if not os.path.exists(abs_config_path):
            raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")

        with open(abs_config_path, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
        # An empty YAML document loads as None; treat it as "all defaults".
        if loaded is None:
            loaded = {}
        if not isinstance(loaded, dict):
            raise ValueError(f"Configuration file must contain a YAML mapping: {abs_config_path}")
        self.config = loaded

        if not os.path.isdir(working_dir):
            raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
        self.working_dir = os.path.abspath(working_dir)

        abs_output_dir = self._output_dir()
        os.makedirs(abs_output_dir, exist_ok=True)

        log_file_path = os.path.abspath(os.path.join(abs_output_dir, "parser_run.log"))
        # Avoid stacking duplicate handlers (and duplicate log lines) when this
        # method is called more than once in the same process.
        already_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == log_file_path
            for handler in logger.handlers
        )
        if not already_attached:
            file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
            file_handler.setFormatter(logging.Formatter(
                "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
                datefmt="%Y-%m-%d %H:%M:%S",
            ))
            logger.addHandler(file_handler)

        logger.info("Context Initialized with Hybrid Processing Configuration.")

    def extract_attributes(self, file_path: str, csv_idx: int = 0) -> List[str]:
        """Return the non-empty attribute strings found in ``file_path``.

        CSV files contribute the non-null values of column ``csv_idx``; PDF and
        Markdown files contribute every non-blank line, stripped.

        Raises:
            ValueError: If the file extension is not .csv, .pdf, .md or .markdown.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == '.csv':
            return pd.read_csv(file_path).iloc[:, csv_idx].dropna().astype(str).tolist()
        if ext == '.pdf':
            lines: List[str] = []
            for page in PdfReader(file_path).pages:
                # Guard against pages where no text could be extracted.
                text = page.extract_text() or ""
                lines.extend(s for s in (raw.strip() for raw in text.split('\n')) if s)
            return lines
        if ext in ('.md', '.markdown'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        raise ValueError(f"Unsupported format: {ext}")

    # The return annotation is a forward reference so the class body does not
    # need AttributeAnalysis to be resolvable at definition time.
    def analyze_batch(self, attributes: List[str]) -> List["AttributeAnalysis"]:
        """Ask the configured Ollama model to classify one batch of attributes.

        The response is constrained to the ``BatchAnalysisResponse`` JSON schema
        and validated through that model before the ``analysis`` list is returned.
        """
        prompt = f"""
        Analyze the following data dictionary attributes.

        ### EXAMPLES OF EXCELLENT PERFORMANCE
        Input: ["BorrCity", "BankStreet", "GrossApproval", "SoldSecMrktInd"]
        Output Schema Map:
        {{
          "analysis": [
            {{"attribute_name": "BorrCity", "provisional_entity": "Borrower", "is_geographical": true, "related_entity": "Borrower", "provisional_python_type": "str"}},
            {{"attribute_name": "BankStreet", "provisional_entity": "Bank", "is_geographical": true, "related_entity": "Bank", "provisional_python_type": "str"}},
            {{"attribute_name": "GrossApproval", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "float"}},
            {{"attribute_name": "SoldSecMrktInd", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "bool"}}
          ]
        }}

        ### CURRENT EXECUTION BATCH
        Attributes to process: {json.dumps(attributes)}
        """
        response = ollama.chat(
            model=self.config.get('model_name', 'llama3.2'),
            messages=[
                {"role": "system", "content": self.config.get('system_prompt', 'You are a precise data engineering assistant. Respond strictly in JSON.')},
                {"role": "user", "content": prompt},
            ],
            # Honour the configured temperature; 0.0 remains the default.
            options={"temperature": self.config.get('temperature', 0.0)},
            format=BatchAnalysisResponse.model_json_schema(),
        )
        return BatchAnalysisResponse(**json.loads(response['message']['content'])).analysis

    def post_process_cleaner(self, analysis_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Apply deterministic naming heuristics on top of the model output.

        Prefix rules (``Borr``, ``Bank``, ``Project``, ``SBA``) and keyword
        rules (``Approval``, ``Disbursement``, ``Program``, ``Subprogram``)
        override the entity; ``...Ind`` / ``Indicator`` names are forced to
        ``bool``; non-geographical records get an empty ``related_entity``.
        Input records are copied, never mutated.
        """
        cleaned_records = []
        for item in analysis_list:
            data = item.copy()
            attr = data['attribute_name']

            if attr.startswith('Borr'):
                data['provisional_entity'] = 'Borrower'
                if data['is_geographical']:
                    data['related_entity'] = 'Borrower'
            elif attr.startswith('Bank'):
                data['provisional_entity'] = 'Bank'
                if attr in ('BankStreet', 'BankCity', 'BankState', 'BankZip'):
                    data['is_geographical'] = True
                    data['related_entity'] = 'Bank'
            elif attr.startswith('Project'):
                data['provisional_entity'] = 'Project'
                if attr in ('ProjectCounty', 'ProjectState'):
                    data['is_geographical'] = True
                    data['related_entity'] = 'Project'
            elif 'Approval' in attr or 'Disbursement' in attr or attr in ('Program', 'Subprogram'):
                data['provisional_entity'] = 'Loan'
            elif attr.startswith('SBA'):
                data['provisional_entity'] = 'SBA'
                if data['is_geographical']:
                    data['related_entity'] = 'SBA'

            if attr.endswith('Ind') or 'Indicator' in attr:
                data['provisional_python_type'] = 'bool'

            if not data['is_geographical']:
                data['related_entity'] = ""
            cleaned_records.append(data)
        return cleaned_records

    def generate_parsing_markdown_summary(self, final_results: List[Dict[str, Any]], base_project_dir: str):
        """Write ``dd_parsing_summary.md`` under the configured documents directory.

        The report lists attribute counts per entity and groups attributes into
        bool ("categorical"), int/float ("numerical") and all remaining types
        ("semantic"). An empty ``final_results`` produces a report with zero counts.
        """
        # Fixing the column set keeps an empty result list from raising KeyError.
        df = pd.DataFrame(final_results, columns=self._RESULT_COLUMNS)
        doc_dir_name = self.config.get('documents_dir', 'documents')
        abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
        os.makedirs(abs_doc_dir, exist_ok=True)

        report_path = os.path.join(abs_doc_dir, "dd_parsing_summary.md")
        entity_counts = df['provisional_entity'].value_counts()
        total_attributes = len(df)

        type_col = df['provisional_python_type']
        categorical_df = df[type_col == 'bool']
        numerical_df = df[type_col.isin(['int', 'float'])]
        semantic_df = df[~type_col.isin(['bool', 'int', 'float'])]

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# 📑 KMDS Data Helper: Data Dictionary Parsing Summary\n\n")
            # Two trailing spaces force a Markdown line break between the two totals.
            f.write(f"**Total Tracked Attributes:** {total_attributes}  \n")
            f.write(f"**Unique Detected Entities:** {len(entity_counts)}\n\n")

            f.write("## 🏗️ Entity Size & Distribution Profile\n")
            f.write("| Detected Entity Node | Number of Attributes (Size) |\n")
            f.write("| :--- | :--- |\n")
            for ent, count in entity_counts.items():
                f.write(f"| {ent} | {count} |\n")
            f.write("\n")

            f.write("## 🎛️ Attribute Structural Categories\n")

            f.write(f"### 📊 Categorical Fields (Total: {len(categorical_df)})\n")
            for rec in categorical_df.to_dict('records'):
                # The backslash is escaped: a bare "\r" would emit a carriage return.
                f.write(f"- `{rec['attribute_name']}` ({rec['provisional_python_type']}) $\\rightarrow$ Node: **{rec['provisional_entity']}**\n")
            f.write("\n")

            f.write(f"### 🔢 Numerical Fields (Total: {len(numerical_df)})\n")
            for rec in numerical_df.to_dict('records'):
                f.write(f"- `{rec['attribute_name']}` ({rec['provisional_python_type']}) $\\rightarrow$ Node: **{rec['provisional_entity']}**\n")
            f.write("\n")

            f.write(f"### 🧠 Semantic Attributes Grouped By Parent Class (Total: {len(semantic_df)})\n")
            for ent_group, group_df in semantic_df.groupby('provisional_entity'):
                f.write(f"#### Entity Category: `{ent_group}`\n")
                for rec in group_df.to_dict('records'):
                    geo_suffix = f" [GEO Linked: {rec['related_entity']}]" if rec['is_geographical'] else ""
                    f.write(f" - `{rec['attribute_name']}` ({rec['provisional_python_type']}){geo_suffix}\n")
                f.write("\n")

    def process(self):
        """Run extraction, classification and reporting for every configured file.

        Missing files, unreadable files and failed batches are logged and
        skipped so one bad input does not abort the run. For each file with at
        least one classified attribute, a CSV is written whose first line is
        the ``# DD-PARSER-SIGNATURE`` marker, and the Markdown summary is
        regenerated (so it reflects the most recently processed file).
        """
        files_to_process = self.config.get('files', [])
        if not files_to_process:
            return

        abs_output_dir = self._output_dir()
        os.makedirs(abs_output_dir, exist_ok=True)

        # A zero or missing batch size would make range() fail or never advance.
        batch_size = max(1, int(self.config.get('batch_size') or 10))
        csv_col_idx = self.config.get('csv_target_column_index', 0)
        config_filename = self.config.get('output_filename')
        base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))

        for filepath in files_to_process:
            input_file_path = self._resolve_path(filepath, self.working_dir)
            if not os.path.exists(input_file_path):
                logger.warning("Skipping missing input file: %s", input_file_path)
                continue

            filename = os.path.basename(input_file_path)
            try:
                raw_attributes = self.extract_attributes(input_file_path, csv_col_idx)
            except Exception:
                logger.warning("Skipping %s: attribute extraction failed", input_file_path, exc_info=True)
                continue

            final_results = []
            for start in range(0, len(raw_attributes), batch_size):
                batch = raw_attributes[start:start + batch_size]
                try:
                    batch_output = self.analyze_batch(batch)
                    batch_dicts = [item.model_dump() for item in batch_output]
                    final_results.extend(self.post_process_cleaner(batch_dicts))
                except Exception:
                    # Best effort: keep going, but record which attributes were lost.
                    logger.warning(
                        "Batch %d-%d of %s failed and was dropped",
                        start, start + len(batch) - 1, filename, exc_info=True,
                    )

            if not final_results:
                continue

            # splitext returns (stem, ext); only the stem belongs in the file name.
            stem = os.path.splitext(filename)[0]
            if config_filename and len(files_to_process) == 1:
                out_filename = config_filename
            else:
                out_filename = f"mapped_{stem}.csv"
            output_csv_path = os.path.join(abs_output_dir, out_filename)

            preamble = f"# DD-PARSER-SIGNATURE: PROCESSED-BY-{str(self.config.get('model_name', 'llama3.2')).upper()}\n"
            with open(output_csv_path, 'w', encoding='utf-8') as f:
                f.write(preamble)

            pd.DataFrame(final_results).to_csv(output_csv_path, mode='a', index=False)
            self.generate_parsing_markdown_summary(final_results, base_project_dir)
298
+ ```
299
+
300
+ ---
301
+
302
+ ## 🚀 Ready for Next Sprint
303
+
304
+ When you initiate your next tracking session, we will pick up directly with the data cleaner logic to add:
305
+
306
+ * Geographic Scrubbing Routines: Processing columns tagged with `is_geographical: true` to enforce title casing (e.g. `"Colorado Springs"`) and length-padding string masks for postal indices [15] (e.g., zero-padding ZIP codes to a strict length of 5 digits).
307
+ * Missing Value Custom Strategy Options: Designing explicit rules to safely substitute or isolate null cells based on attribute mappings.
308
+
309
+ Let me know whenever you are ready to kick off the geographic data cleaning extensions!
@@ -0,0 +1,25 @@
1
+ [project]
2
+ name = "dd-parser-cleaner" # Distribution name; ships both the dd_parser and dd_cleaner packages
3
+ version = "0.1.0"
4
+ description = "A private, local LLM-powered data dictionary parser and entity mapper with automated cleaning."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "pandas>=2.2.0",
9
+ "pydantic>=2.6.0",
10
+ "pypdf>=4.1.0",
11
+ "ollama>=0.2.0",
12
+ "pyyaml>=6.0.1",
13
+ ]
14
+
15
+ [project.scripts]
16
+ classify-entities = "dd_parser.cli:main" # CLI entry point for the data dictionary parser
17
+ clean-dataset = "dd_cleaner.cli:main" # CLI entry point for the dataset cleaner
18
+
19
+ [build-system]
20
+ requires = ["hatchling"]
21
+ build-backend = "hatchling.build"
22
+
23
+ # Explicitly list both packages so hatchling can locate them under src/ when building the wheel
24
+ [tool.hatch.build.targets.wheel]
25
+ packages = ["src/dd_parser", "src/dd_cleaner"]
File without changes
@@ -0,0 +1,35 @@
1
+ import argparse
2
+ import logging
3
+ import sys
4
+
5
def main():
    """Command-line entry point for the data cleaner pipeline.

    Parses ``--working-dir`` and ``--config``, runs ``DataCleanerEngine`` and
    exits with status 1 if the pipeline raises. argparse itself exits with
    status 2 when a required argument is missing.
    """
    # 1. Initialize clean, standardized console log streaming
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    parser = argparse.ArgumentParser(
        description="Run the data cleaner pipeline driven by the verified dd-parser blueprint matrix map."
    )
    parser.add_argument("--working-dir", required=True, help="Path to raw data directories containing target payloads.")
    parser.add_argument("--config", required=True, help="Path to config.yaml file containing script targets.")
    args = parser.parse_args()

    # Imported here, after logging is configured and arguments are validated.
    from dd_cleaner.engine import DataCleanerEngine

    try:
        # 2. Fire up the pipeline runner
        cleaner = DataCleanerEngine()
        cleaner.set_working_config(working_dir=args.working_dir, config_path=args.config)
        cleaner.clean_dataset()
    except Exception as e:
        # logger.exception keeps the traceback, which a bare str(e) would lose.
        logging.getLogger("dd_cleaner_main").exception("🛑 Critical Pipeline Failure: %s", e)
        sys.exit(1)
33
+
34
+ if __name__ == "__main__":
35
+ main()
@@ -0,0 +1,160 @@
1
+ import os
2
+ import logging
3
+ import yaml
4
+ import pandas as pd
5
+ from typing import Dict, Any
6
+
7
logger = logging.getLogger("dd_cleaner")


class DataCleanerEngine:
    """Clean a raw dataset using the blueprint produced by dd_parser.

    Workflow: ``set_working_config`` loads the shared YAML config, then
    ``clean_dataset`` validates the signed blueprint CSV, converts each matched
    column to its blueprint type, writes the cleaned CSV and a Markdown
    report, and finally writes a feature-prep CSV with imputed values and
    per-entity geo search strings. Blueprint attribute names are matched to
    dataset columns case-insensitively.
    """

    # Tokens treated as boolean True (compared after upper-casing and stripping).
    _TRUTHY_TOKENS = ['TRUE', '1', 'Y', 'YES', 'T']
    # String values treated as "missing" by the feature-prep stage.
    _MISSING_TOKENS = ["nan", "None", ""]

    def __init__(self):
        self.working_dir: str = ""
        self.config: Dict[Any, Any] = {}

    @staticmethod
    def _resolve_path(path: str, base_dir: str) -> str:
        """Return ``path`` unchanged if absolute, else resolve it under ``base_dir``."""
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base_dir, path))

    def set_working_config(self, working_dir: str, config_path: str):
        """Load the shared config and bind the root working directory.

        Raises:
            FileNotFoundError: If the config file or working directory is missing.
            ValueError: If the config file does not contain a YAML mapping.
        """
        abs_config_path = os.path.abspath(config_path)
        if not os.path.exists(abs_config_path):
            raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")
        with open(abs_config_path, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
        # An empty YAML document loads as None; treat it as "all defaults".
        if loaded is None:
            loaded = {}
        if not isinstance(loaded, dict):
            raise ValueError(f"Configuration file must contain a YAML mapping: {abs_config_path}")
        self.config = loaded
        if not os.path.isdir(working_dir):
            raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
        self.working_dir = os.path.abspath(working_dir)
        logger.info("Cleaner Context Initialized.")

    def verify_and_load_blueprint(self) -> pd.DataFrame:
        """Load the dd_parser blueprint CSV after checking its signature line.

        Raises:
            FileNotFoundError: If the blueprint file does not exist.
            ValueError: If the first line does not start with
                ``# DD-PARSER-SIGNATURE``.
        """
        parser_dir = self.config.get('dd_parser_output_dir', 'dd_analysis_results')
        blueprint_name = self.config.get('output_filename', 'sba_analysis_results.csv')
        if os.path.isabs(parser_dir):
            blueprint_path = os.path.join(parser_dir, blueprint_name)
        else:
            blueprint_path = os.path.abspath(os.path.join(self.working_dir, parser_dir, blueprint_name))

        if not os.path.exists(blueprint_path):
            raise FileNotFoundError(f"Missing parsing matrix blueprint file at: {blueprint_path}")

        with open(blueprint_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
        if not first_line.startswith("# DD-PARSER-SIGNATURE"):
            raise ValueError(f"Rejected: File at {blueprint_path} does not originate from dd-parser pipeline!")

        # NOTE(review): comment='#' also truncates any data value containing '#'.
        return pd.read_csv(blueprint_path, comment='#')

    def generate_cleaning_markdown_summary(self, data_df: pd.DataFrame, base_project_dir: str):
        """Write ``data_cleaning_summary.md`` under the configured documents directory.

        The report contains one table of dtype counts and one table listing,
        per column, the number of missing values and the share of non-missing rows.
        """
        doc_dir_name = self.config.get('documents_dir', 'documents')
        abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
        os.makedirs(abs_doc_dir, exist_ok=True)

        report_path = os.path.join(abs_doc_dir, "data_cleaning_summary.md")
        type_summary = data_df.dtypes.value_counts()
        null_counts = data_df.isnull().sum()
        total_rows = len(data_df)

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# 🧼 KMDS Data Helper: Data Cleaning Summary Report\n\n")
            f.write("## 📊 Converted Data Types Summary\n")
            # Header and alignment rows are required for Markdown to render a table.
            f.write("| Data Type | Column Count |\n")
            f.write("| :--- | :--- |\n")
            for dtype_name, count in type_summary.items():
                f.write(f"| {dtype_name} | {count} |\n")
            f.write("\n## 🗃️ Missing Value Counts\n")
            f.write("| Column | Missing Values | Completeness |\n")
            f.write("| :--- | :--- | :--- |\n")
            for col, missing in null_counts.items():
                # An empty dataset has no meaningful completeness ratio.
                if total_rows:
                    completeness = f"{(total_rows - missing) / total_rows * 100:.2f}%"
                else:
                    completeness = "n/a"
                f.write(f"| `{col}` | {missing} | {completeness} |\n")

        logger.info(f"Generated clean dataset validation profile saved to: {report_path}")

    def execute_numeric_imputer(self, series: pd.Series) -> pd.Series:
        """Fill missing numeric values with the series median.

        Isolated so the strategy can later be swapped (e.g. for kNN or
        iterative imputation) without touching the callers.
        """
        return series.fillna(series.median())

    def execute_poc_feature_prep(self, df: pd.DataFrame, blueprint_df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with geo search strings and missing-value handling.

        For each entity with geographical attributes, adds a
        ``<entity>_geo_search_string`` column joining those columns with
        ``", "``. Then int/float columns are median-imputed and str columns
        get ``"MISSING"`` in place of null / ``"nan"`` / ``"None"`` / empty values.
        """
        prep_df = df.copy()
        raw_cols_lower = {col.lower(): col for col in prep_df.columns}

        # 1. Build one search string per entity, before "MISSING" markers are
        #    written, so placeholders never end up inside an address.
        geo_blueprint = blueprint_df[blueprint_df['is_geographical'].eq(True)]
        for entity_name, group in geo_blueprint.groupby('provisional_entity'):
            geo_cols = [
                raw_cols_lower[key]
                for key in (str(attr).lower() for attr in group['attribute_name'])
                if key in raw_cols_lower
            ]
            if geo_cols:
                logger.info(f" -> Consolidating geo attributes for entity: '{entity_name}'")
                prep_df[f"{str(entity_name).lower()}_geo_search_string"] = (
                    prep_df[geo_cols].fillna("").astype(str).agg(", ".join, axis=1)
                )

        # 2. Per-column missing-value strategy driven by the blueprint type.
        for attr, t_type in zip(blueprint_df['attribute_name'], blueprint_df['provisional_python_type']):
            col_name = raw_cols_lower.get(str(attr).lower())
            if col_name is None:
                continue
            column = prep_df[col_name]

            if t_type in ('int', 'float'):
                if column.isnull().any():
                    prep_df[col_name] = self.execute_numeric_imputer(column)
            elif t_type == 'str':
                # mask() avoids Series.replace(..., None), whose meaning has
                # changed between pandas versions.
                is_missing = column.isna() | column.isin(self._MISSING_TOKENS)
                prep_df[col_name] = column.mask(is_missing, "MISSING")

        return prep_df

    @classmethod
    def _convert_column(cls, column: pd.Series, target_type: str) -> pd.Series:
        """Convert one column to the blueprint's ``provisional_python_type``."""
        if target_type == 'bool':
            if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
                # Text flags: anything outside the truthy token list (nulls included) is False.
                return column.astype(str).str.upper().str.strip().isin(cls._TRUTHY_TOKENS)
            return column.fillna(False).astype(bool)
        if target_type in ('int', 'float'):
            return pd.to_numeric(column, errors='coerce')
        if target_type in ('datetime.date', 'datetime.datetime'):
            return pd.to_datetime(column, errors='coerce')
        # Text: strip whitespace but keep nulls as nulls. A plain astype(str)
        # would turn them into the literal "nan", hiding them from the
        # missing-value report and leaking "nan" into geo search strings.
        return column.astype(str).str.strip().mask(column.isna())

    def clean_dataset(self):
        """Run the full cleaning pipeline and write all outputs.

        Reads the raw dataset from ``<working_dir>/../data/`` (unless the
        configured name is absolute), applies blueprint type conversions,
        writes the cleaned CSV and Markdown report, then writes
        ``feature_selection_ready.csv`` next to the cleaned CSV.
        """
        blueprint_df = self.verify_and_load_blueprint()
        raw_file = self.config.get('raw_dataset_file', 'sba_loans_raw.csv')
        base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))
        data_root = os.path.join(base_project_dir, "data")
        raw_path = self._resolve_path(raw_file, data_root)

        data_df = pd.read_csv(raw_path)
        raw_columns_lower = {col.lower(): col for col in data_df.columns}

        # [STAGE 1] Perform baseline type conversions
        for blueprint_attr, target_type in zip(blueprint_df['attribute_name'], blueprint_df['provisional_python_type']):
            col_name = raw_columns_lower.get(str(blueprint_attr).lower())
            if col_name is None:
                continue
            try:
                data_df[col_name] = self._convert_column(data_df[col_name], target_type)
            except Exception:
                # Best effort: leave the column as-is, but make the failure visible.
                logger.warning(
                    "Could not convert column '%s' to '%s'; leaving it unchanged",
                    col_name, target_type, exc_info=True,
                )

        # [STAGE 2] Write standard output and generate report
        cleaner_dir = self.config.get('dd_cleaner_output_dir', 'dd_cleaner_results')
        clean_filename = self.config.get('clean_output_filename', 'sba_loans_clean.csv')
        abs_dest_dir = self._resolve_path(cleaner_dir, data_root)
        os.makedirs(abs_dest_dir, exist_ok=True)

        data_df.to_csv(os.path.join(abs_dest_dir, clean_filename), index=False)
        self.generate_cleaning_markdown_summary(data_df, base_project_dir)

        # [STAGE 3] Run PoC feature prep AFTER generating the report, so the
        # report reflects the data before imputation.
        logger.info("Executing PoC Missing Value Strategies and Geo-string preparation...")
        poc_ready_df = self.execute_poc_feature_prep(data_df, blueprint_df)

        # Save the finalized feature-selection dataset
        poc_output_path = os.path.join(abs_dest_dir, "feature_selection_ready.csv")
        poc_ready_df.to_csv(poc_output_path, index=False)
        logger.info(f"🎉 Hand-off Complete! Modeling matrix saved to: {poc_output_path}")
@@ -0,0 +1,9 @@
1
+ # src/dd_parser/__init__.py
2
+
3
+ from dd_parser.core import LocalEntityClassifier
4
+ from dd_parser.models import AttributeAnalysis
5
+
6
+ __all__ = [
7
+ "LocalEntityClassifier",
8
+ "AttributeAnalysis",
9
+ ]
@@ -0,0 +1,25 @@
1
+ import argparse
2
+ import logging
3
+
4
def main():
    """Command-line entry point for the local LLM data dictionary parser.

    Parses ``--working-dir`` and ``--config``, runs ``LocalEntityClassifier``
    and exits with status 1 if the run raises. argparse itself exits with
    status 2 when a required argument is missing.
    """
    import sys  # local import: only needed for the failure exit path

    # Setup initial clean console output streaming
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    parser = argparse.ArgumentParser(description="Run the private local LLM Data Dictionary Parser.")
    parser.add_argument("--working-dir", required=True, help="Path to raw source files.")
    parser.add_argument("--config", required=True, help="Path to config.yaml file.")
    args = parser.parse_args()

    # Imported here, after logging is configured and arguments are validated.
    from dd_parser.core import LocalEntityClassifier

    try:
        classifier = LocalEntityClassifier()
        classifier.set_working_config(working_dir=args.working_dir, config_path=args.config)
        classifier.process()
    except Exception as e:
        # Mirror the dd_cleaner CLI: log the failure with its traceback and
        # exit non-zero. The "dd_parser.cli" logger is a child of "dd_parser",
        # so the record also reaches any handler attached to that logger.
        logging.getLogger("dd_parser.cli").exception("Critical Pipeline Failure: %s", e)
        sys.exit(1)
23
+
24
+ if __name__ == "__main__":
25
+ main()
@@ -0,0 +1,224 @@
1
+ import os
2
+ import json
3
+ import yaml
4
+ import time
5
+ import logging
6
+ import pandas as pd
7
+ from pypdf import PdfReader
8
+ import ollama
9
+ from typing import List, Dict, Any
10
+ from dd_parser.models import AttributeAnalysis, BatchAnalysisResponse
11
+
12
+ logger = logging.getLogger("dd_parser")
13
+
14
class LocalEntityClassifier:
    """Map data-dictionary attributes to business entities with a local Ollama model.

    Typical use: call :meth:`set_working_config` once to load the YAML
    configuration and bind the working directory, then :meth:`process` to
    extract attributes from every configured file, classify them in batches,
    apply rule-based corrections and write a CSV blueprint plus a Markdown
    summary.
    """

    def __init__(self):
        self.working_dir: str = ""
        self.config: Dict[Any, Any] = {}

    @staticmethod
    def _resolve_path(base_dir: str, path: str) -> str:
        """Return *path* unchanged when absolute, otherwise anchor it under *base_dir*."""
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base_dir, path))

    def _resolve_output_dir(self) -> str:
        """Return the absolute directory that receives the parser's CSV and log output."""
        raw_output_dir = self.config.get('dd_parser_output_dir', 'dd_analysis_results')
        return self._resolve_path(self.working_dir, raw_output_dir)

    def set_working_config(self, working_dir: str, config_path: str):
        """Load the YAML configuration and bind the classifier to *working_dir*.

        Also creates the output directory and attaches a file handler that
        mirrors log records into ``parser_run.log`` inside it.

        Args:
            working_dir: Directory holding the raw data dictionary files.
            config_path: Path to the YAML configuration file.

        Raises:
            FileNotFoundError: If the config file or the working directory is missing.
            ValueError: If the config file does not contain a YAML mapping.
        """
        abs_config_path = os.path.abspath(config_path)
        if not os.path.exists(abs_config_path):
            raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")

        with open(abs_config_path, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
        # An empty YAML document parses to None; fall back to defaults in that case.
        if loaded is None:
            loaded = {}
        if not isinstance(loaded, dict):
            raise ValueError(f"Configuration file must contain a YAML mapping: {abs_config_path}")
        self.config = loaded

        if not os.path.isdir(working_dir):
            raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
        self.working_dir = os.path.abspath(working_dir)

        abs_output_dir = self._resolve_output_dir()
        os.makedirs(abs_output_dir, exist_ok=True)

        log_file_path = os.path.join(abs_output_dir, "parser_run.log")
        # Attach the file handler only once per log file; repeated calls would
        # otherwise write every record several times.
        target_log = os.path.abspath(log_file_path)
        already_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == target_log
            for handler in logger.handlers
        )
        if not already_attached:
            file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
            file_handler.setFormatter(
                logging.Formatter(
                    "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                )
            )
            logger.addHandler(file_handler)

        logger.info("Context Initialized with Hybrid Processing Configuration.")
        logger.info(f"Loaded Config: {abs_config_path} | Tracking Log: {log_file_path}")

    def extract_attributes(self, file_path: str, csv_idx: int = 0) -> List[str]:
        """Read the raw attribute names from a CSV, PDF or Markdown file.

        Args:
            file_path: File to read; the format is chosen by its extension.
            csv_idx: Zero-based column holding the attribute names (CSV only).

        Returns:
            Non-empty attribute strings in file order.

        Raises:
            ValueError: If the file extension is not supported.
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        if ext == '.csv':
            return pd.read_csv(file_path).iloc[:, csv_idx].dropna().astype(str).tolist()

        if ext == '.pdf':
            attributes: List[str] = []
            for page in PdfReader(file_path).pages:
                # Pages without a text layer may yield nothing; treat that as empty text.
                page_text = page.extract_text() or ""
                for line in page_text.split('\n'):
                    stripped = line.strip()
                    if stripped:
                        attributes.append(stripped)
            return attributes

        if ext in ('.md', '.markdown'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]

        raise ValueError(f"Unsupported format: {ext}")

    def analyze_batch(self, attributes: List[str]) -> "List[AttributeAnalysis]":
        """Send one batch of attribute names to the model and parse its structured reply.

        The request passes the JSON schema of ``BatchAnalysisResponse`` as the
        response format; the reply is validated through that same model.

        Args:
            attributes: Attribute names to classify in a single request.

        Returns:
            The ``analysis`` list of the validated response.
        """
        prompt = f"""
        Analyze the following data dictionary attributes.

        ### EXAMPLES OF EXCELLENT PERFORMANCE
        Input: ["BorrCity", "BankStreet", "GrossApproval", "SoldSecMrktInd"]
        Output Schema Map:
        {{
        "analysis": [
        {{"attribute_name": "BorrCity", "provisional_entity": "Borrower", "is_geographical": true, "related_entity": "Borrower", "provisional_python_type": "str"}},
        {{"attribute_name": "BankStreet", "provisional_entity": "Bank", "is_geographical": true, "related_entity": "Bank", "provisional_python_type": "str"}},
        {{"attribute_name": "GrossApproval", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "float"}},
        {{"attribute_name": "SoldSecMrktInd", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "bool"}}
        ]
        }}

        ### CURRENT EXECUTION BATCH
        Attributes to process: {json.dumps(attributes)}
        """

        response = ollama.chat(
            model=self.config.get('model_name', 'llama3.2'),
            messages=[
                {"role": "system", "content": self.config.get('system_prompt', 'You are a precise data engineering assistant. Respond strictly in JSON.')},
                {"role": "user", "content": prompt}
            ],
            options={"temperature": self.config.get('temperature', 0.0)},
            format=BatchAnalysisResponse.model_json_schema()
        )
        return BatchAnalysisResponse(**json.loads(response['message']['content'])).analysis

    def post_process_cleaner(self, analysis_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Apply deterministic naming rules on top of the model's classification.

        Prefix and keyword rules override ``provisional_entity``, known address
        columns are forced to geographical, ``*Ind`` / ``*Indicator*`` names are
        typed as ``bool``, and ``related_entity`` is blanked for every
        non-geographical attribute.

        Args:
            analysis_list: Records with the keys of ``AttributeAnalysis``.

        Returns:
            New records; the input dictionaries are not modified.
        """
        cleaned_records = []
        for item in analysis_list:
            data = item.copy()
            attr = data['attribute_name']

            if attr.startswith('Borr'):
                data['provisional_entity'] = 'Borrower'
                if data['is_geographical']:
                    data['related_entity'] = 'Borrower'
            elif attr.startswith('Bank'):
                data['provisional_entity'] = 'Bank'
                if attr in ('BankStreet', 'BankCity', 'BankState', 'BankZip'):
                    data['is_geographical'] = True
                    data['related_entity'] = 'Bank'
            elif attr.startswith('Project'):
                data['provisional_entity'] = 'Project'
                if attr in ('ProjectCounty', 'ProjectState'):
                    data['is_geographical'] = True
                    data['related_entity'] = 'Project'
            elif 'Approval' in attr or 'Disbursement' in attr or attr in ('Program', 'Subprogram'):
                data['provisional_entity'] = 'Loan'
            elif attr.startswith('SBA'):
                data['provisional_entity'] = 'SBA'
                if data['is_geographical']:
                    data['related_entity'] = 'SBA'

            if attr.endswith('Ind') or 'Indicator' in attr:
                data['provisional_python_type'] = 'bool'

            # A related entity is only meaningful for geographical attributes.
            if not data['is_geographical']:
                data['related_entity'] = ""

            cleaned_records.append(data)
        return cleaned_records

    def generate_parsing_markdown_summary(self, final_results: List[Dict[str, Any]], base_project_dir: str):
        """Write ``dd_parsing_summary.md`` describing the classified attributes.

        The report lists attribute counts per entity, the boolean and numeric
        attributes, and the remaining attributes grouped by entity.

        Args:
            final_results: Cleaned analysis records (keys of ``AttributeAnalysis``).
            base_project_dir: Directory under which the documents folder
                (config key ``documents_dir``, default ``documents``) is created.
        """
        df = pd.DataFrame(final_results)
        if df.empty:
            # Without rows the entity columns do not exist; nothing to report.
            logger.warning("No analysis results available; Markdown summary not generated.")
            return

        doc_dir_name = self.config.get('documents_dir', 'documents')
        abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
        os.makedirs(abs_doc_dir, exist_ok=True)

        report_path = os.path.join(abs_doc_dir, "dd_parsing_summary.md")
        entity_counts = df['provisional_entity'].value_counts()
        total_attributes = len(df)

        type_column = df['provisional_python_type']
        categorical_df = df[type_column == 'bool']
        numerical_df = df[type_column.isin(['int', 'float'])]
        semantic_df = df[~type_column.isin(['bool', 'int', 'float'])]

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# 📑 KMDS Data Helper: Data Dictionary Parsing Summary\n\n")
            # Two trailing spaces form a Markdown hard line break.
            f.write(f"**Total Tracked Attributes:** {total_attributes}  \n")
            f.write(f"**Unique Detected Entities:** {len(entity_counts)}\n\n")

            f.write("## 🏗️ Entity Size & Distribution Profile\n")
            f.write("| Detected Entity Node | Number of Attributes (Size) |\n")
            f.write("| :--- | :--- |\n")
            for ent, count in entity_counts.items():
                f.write(f"| {ent} | {count} |\n")
            f.write("\n")

            f.write("## 🎛️ Attribute Structural Categories\n")
            typed_sections = (
                ("### 📊 Categorical Fields", categorical_df),
                ("### 🔢 Numerical Fields", numerical_df),
            )
            for heading, section_df in typed_sections:
                f.write(f"{heading} (Total: {len(section_df)})\n")
                for _, row in section_df.iterrows():
                    # The backslash is escaped so the LaTeX arrow is written literally;
                    # an unescaped "\r" would emit a carriage return instead.
                    f.write(
                        f"- `{row['attribute_name']}` ({row['provisional_python_type']}) "
                        f"$\\rightarrow$ Node: **{row['provisional_entity']}**\n"
                    )
                f.write("\n")

            f.write(f"### 🧠 Semantic Attributes Grouped By Parent Class (Total: {len(semantic_df)})\n")
            for ent_group, group_df in semantic_df.groupby('provisional_entity'):
                f.write(f"#### Entity Category: `{ent_group}`\n")
                for _, row in group_df.iterrows():
                    geo_suffix = f" [GEO Linked: {row['related_entity']}]" if row['is_geographical'] else ""
                    f.write(f" - `{row['attribute_name']}` ({row['provisional_python_type']}){geo_suffix}\n")
                f.write("\n")

        logger.info(f"Generated parser metadata documentation report saved to: {report_path}")

    def process(self):
        """Run the full pipeline for every file listed under ``files`` in the config.

        For each file: extract attributes, classify them batch by batch, apply
        the rule-based cleaner, write the CSV blueprint (prefixed with a
        signature line) and regenerate the Markdown summary. A failing file or
        batch is logged and skipped so the remaining work still runs. The
        summary path does not depend on the input file, so with several files
        the last one with results determines its content.

        Raises:
            ValueError: If ``batch_size`` in the config is smaller than 1.
        """
        files_to_process = self.config.get('files', [])
        if not files_to_process:
            logger.warning("No input files configured under 'files'; nothing to process.")
            return

        abs_output_dir = self._resolve_output_dir()
        os.makedirs(abs_output_dir, exist_ok=True)

        batch_size = self.config.get('batch_size', 10)
        csv_col_idx = self.config.get('csv_target_column_index', 0)
        if not isinstance(batch_size, int) or batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got: {batch_size!r}")
        config_filename = self.config.get('output_filename')

        # Extract base project root for kmds-data-helper structure alignment
        base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))

        for filepath in files_to_process:
            input_file_path = self._resolve_path(self.working_dir, filepath)
            if not os.path.exists(input_file_path):
                logger.warning(f"Input file not found, skipping: {input_file_path}")
                continue

            filename = os.path.basename(input_file_path)
            try:
                raw_attributes = self.extract_attributes(input_file_path, csv_col_idx)
                logger.info(f"Extracted {len(raw_attributes)} attributes from {filename}")
            except Exception as e:
                logger.exception(f"Read failure on {filename}: {e}")
                continue

            final_results = []
            for i in range(0, len(raw_attributes), batch_size):
                batch = raw_attributes[i:i + batch_size]
                start_time = time.perf_counter()
                try:
                    batch_output = self.analyze_batch(batch)
                    batch_dicts = [item.model_dump() for item in batch_output]
                    cleaned_batch = self.post_process_cleaner(batch_dicts)
                    final_results.extend(cleaned_batch)
                    logger.info(f" Batch run {i} verified in {time.perf_counter() - start_time:.2f}s")
                except Exception as e:
                    # Best effort: keep going with the next batch, but record the traceback.
                    logger.exception(f" Batch crash at element index {i}: {e}")

            if not final_results:
                continue

            # The configured output name is only unambiguous for a single input file.
            if config_filename and len(files_to_process) == 1:
                out_filename = config_filename
            else:
                base_name, _ = os.path.splitext(filename)
                out_filename = f"mapped_{base_name}.csv"

            output_csv_path = os.path.join(abs_output_dir, out_filename)

            preamble = f"# DD-PARSER-SIGNATURE: PROCESSED-BY-{self.config.get('model_name', 'llama3.2').upper()}\n"
            with open(output_csv_path, 'w', encoding='utf-8') as f:
                f.write(preamble)

            pd.DataFrame(final_results).to_csv(output_csv_path, mode='a', index=False)
            logger.info(f"Finished tracking {filename}. Saved verified matrix map to: {output_csv_path}")

            # Dynamic Markdown Generation Task Trigger
            self.generate_parsing_markdown_summary(final_results, base_project_dir)
@@ -0,0 +1,21 @@
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional, Literal
3
+
4
# One classified data-dictionary attribute as returned by the model.
# NOTE: deliberately documented with comments rather than a class docstring,
# because pydantic copies a model docstring into the generated JSON schema,
# and that schema is sent to the LLM as the response format.
class AttributeAnalysis(BaseModel):
    # Attribute name exactly as it appears in the data dictionary.
    attribute_name: str
    # Business entity the attribute is assigned to.
    provisional_entity: str = Field(
        description="The primary business entity this attribute belongs to (e.g., Customer, Product, Transaction)."
    )
    # Whether the attribute describes a physical location.
    is_geographical: bool = Field(
        description="True if the attribute represents a physical location, address, coordinate, country, or region."
    )
    # Entity the location belongs to; optional, defaults to None.
    related_entity: str | None = Field(
        default=None,
        description="If geographical, which entity does this location bind to? (e.g., 'Customer' for 'shipping_state')."
    )
    # Suggested Python type, restricted to a fixed set of type names.
    provisional_python_type: Literal[
        "str",
        "int",
        "float",
        "datetime.date",
        "datetime.datetime",
        "bool",
    ] = Field(
        description="Semantic Python data type based on semantics."
    )
19
+
20
# Envelope for one model reply: the analyses for every attribute in a batch.
# Documented with comments only; a class docstring would be copied into the
# JSON schema that is passed to the LLM as the response format.
class BatchAnalysisResponse(BaseModel):
    analysis: list[AttributeAnalysis]
File without changes
@@ -0,0 +1,27 @@
1
+ import os
2
+ from dd_parser import LocalEntityClassifier
3
+
4
def run_local_test(target_data_dir=None, config_name=None):
    """Run the dd_parser pipeline end to end against a local data dictionary folder.

    Args:
        target_data_dir: Folder containing the dictionary files. When omitted,
            the ``DD_PARSER_TEST_DATA_DIR`` environment variable is used, and
            failing that the original developer path.
        config_name: Path to the YAML config. When omitted, the
            ``DD_PARSER_TEST_CONFIG`` environment variable is used, and
            failing that ``config.yaml`` in the current directory.

    Returns:
        True when the pipeline finished without raising, False otherwise.
    """
    if target_data_dir is None:
        target_data_dir = os.environ.get(
            "DD_PARSER_TEST_DATA_DIR",
            "/home/rajiv/programming/kmds_descriptive_analytics/kmds_sba_loans/data_dictionary",
        )
    if config_name is None:
        config_name = os.environ.get("DD_PARSER_TEST_CONFIG", "config.yaml")

    print("=== Starting dd_parser Integration Test ===")

    try:
        classifier = LocalEntityClassifier()
        classifier.set_working_config(working_dir=target_data_dir, config_path=config_name)

        print("\n🚀 Dispatched extraction and Ollama micro-batching pipelines...")
        classifier.process()
    except Exception as e:
        # Top-level boundary of a manual test script: report the failure and
        # signal it to the caller instead of letting it pass as a success.
        print(f"\n❌ Pipeline execution failed with exception: {e}")
        return False

    print("\n=== Test Finished Successfully ===")
    return True


if __name__ == "__main__":
    # Propagate failure through the exit status so shells and CI can detect it.
    raise SystemExit(0 if run_local_test() else 1)