dd-parser-cleaner 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dd_cleaner/__init__.py +0 -0
- dd_cleaner/cli.py +35 -0
- dd_cleaner/engine.py +160 -0
- dd_parser/__init__.py +9 -0
- dd_parser/cli.py +25 -0
- dd_parser/core.py +224 -0
- dd_parser/models.py +21 -0
- dd_parser/py.typed +0 -0
- dd_parser_cleaner-0.1.0.dist-info/METADATA +10 -0
- dd_parser_cleaner-0.1.0.dist-info/RECORD +12 -0
- dd_parser_cleaner-0.1.0.dist-info/WHEEL +4 -0
- dd_parser_cleaner-0.1.0.dist-info/entry_points.txt +3 -0
dd_cleaner/__init__.py
ADDED
|
File without changes
|
dd_cleaner/cli.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
def main():
    """Console entry point for the data cleaner pipeline.

    Configures root logging, parses the two required command-line options
    (``--working-dir`` and ``--config``), then builds a ``DataCleanerEngine``,
    binds the configuration and runs ``clean_dataset``. Any exception raised
    by the engine is logged as a single error line and the process exits
    with status 1.
    """
    # Root logging is configured before anything else so that the engine
    # module (imported further down) inherits the handler and format.
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
    )

    arg_parser = argparse.ArgumentParser(
        description="Run the data cleaner pipeline driven by the verified dd-parser blueprint matrix map."
    )
    required_options = (
        ("--working-dir", "Path to raw data directories containing target payloads."),
        ("--config", "Path to config.yaml file containing script targets."),
    )
    for flag, help_text in required_options:
        arg_parser.add_argument(flag, required=True, help=help_text)
    cli_args = arg_parser.parse_args()

    # Deliberately imported here rather than at module level: the engine is
    # only loaded once logging has been configured.
    from dd_cleaner.engine import DataCleanerEngine

    try:
        engine = DataCleanerEngine()
        engine.set_working_config(
            working_dir=cli_args.working_dir,
            config_path=cli_args.config,
        )
        engine.clean_dataset()
    except Exception as err:
        # Top-level boundary: report the failure and signal it via exit code.
        failure_log = logging.getLogger("dd_cleaner_main")
        failure_log.error(f"๐ Critical Pipeline Failure: {str(err)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
dd_cleaner/engine.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
import yaml
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from typing import Dict, Any
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger("dd_cleaner")
|
|
8
|
+
|
|
9
|
+
class DataCleanerEngine:
|
|
10
|
+
def __init__(self):
|
|
11
|
+
self.working_dir: str = ""
|
|
12
|
+
self.config: Dict[Any, Any] = {}
|
|
13
|
+
|
|
14
|
+
def set_working_config(self, working_dir: str, config_path: str):
|
|
15
|
+
"""Loads the shared config and binds the root working directory context."""
|
|
16
|
+
abs_config_path = os.path.abspath(config_path)
|
|
17
|
+
if not os.path.exists(abs_config_path):
|
|
18
|
+
raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")
|
|
19
|
+
with open(abs_config_path, 'r') as f:
|
|
20
|
+
self.config = yaml.safe_load(f)
|
|
21
|
+
if not os.path.isdir(working_dir):
|
|
22
|
+
raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
|
|
23
|
+
self.working_dir = os.path.abspath(working_dir)
|
|
24
|
+
logger.info("Cleaner Context Initialized.")
|
|
25
|
+
|
|
26
|
+
def verify_and_load_blueprint(self) -> pd.DataFrame:
|
|
27
|
+
"""Handshakes with the dd_parser output subdirectory to grab the metadata map."""
|
|
28
|
+
parser_dir = self.config.get('dd_parser_output_dir', 'dd_analysis_results')
|
|
29
|
+
blueprint_name = self.config.get('output_filename', 'sba_analysis_results.csv')
|
|
30
|
+
blueprint_path = os.path.isabs(parser_dir) and os.path.join(parser_dir, blueprint_name) or os.path.abspath(os.path.join(self.working_dir, parser_dir, blueprint_name))
|
|
31
|
+
|
|
32
|
+
if not os.path.exists(blueprint_path):
|
|
33
|
+
raise FileNotFoundError(f"Missing parsing matrix blueprint file at: {blueprint_path}")
|
|
34
|
+
|
|
35
|
+
with open(blueprint_path, 'r', encoding='utf-8') as f:
|
|
36
|
+
first_line = f.readline()
|
|
37
|
+
if not first_line.startswith("# DD-PARSER-SIGNATURE"):
|
|
38
|
+
raise ValueError(f"Rejected: File at {blueprint_path} does not originate from dd-parser pipeline!")
|
|
39
|
+
|
|
40
|
+
return pd.read_csv(blueprint_path, comment='#')
|
|
41
|
+
|
|
42
|
+
def generate_cleaning_markdown_summary(self, data_df: pd.DataFrame, base_project_dir: str):
|
|
43
|
+
"""Compiles clean data type breakdowns and null metrics to the documents/ workspace."""
|
|
44
|
+
doc_dir_name = self.config.get('documents_dir', 'documents')
|
|
45
|
+
abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
|
|
46
|
+
os.makedirs(abs_doc_dir, exist_ok=True)
|
|
47
|
+
|
|
48
|
+
report_path = os.path.join(abs_doc_dir, "data_cleaning_summary.md")
|
|
49
|
+
type_summary = data_df.dtypes.value_counts()
|
|
50
|
+
null_counts = data_df.isnull().sum()
|
|
51
|
+
total_rows = len(data_df)
|
|
52
|
+
|
|
53
|
+
with open(report_path, 'w', encoding='utf-8') as f:
|
|
54
|
+
f.write("# ๐งผ KMDS Data Helper: Data Cleaning Summary Report\n\n")
|
|
55
|
+
f.write("## ๐ Converted Data Types Summary\n")
|
|
56
|
+
for dtype_name, count in type_summary.items():
|
|
57
|
+
f.write(f"| {dtype_name} | {count} |\n")
|
|
58
|
+
f.write("\n## ๐๏ธ Missing Value Counts\n")
|
|
59
|
+
for col in data_df.columns:
|
|
60
|
+
f.write(f"| `{col}` | {null_counts[col]} | {(total_rows - null_counts[col])/total_rows*100:.2f}% |\n")
|
|
61
|
+
|
|
62
|
+
logger.info(f"Generated clean dataset validation profile saved to: {report_path}")
|
|
63
|
+
|
|
64
|
+
def execute_numeric_imputer(self, series: pd.Series) -> pd.Series:
|
|
65
|
+
"""
|
|
66
|
+
Isolated strategy vector for missing continuous numerical data items.
|
|
67
|
+
Swap out this inner logic to upgrade from median to kNN/Iterative models later.
|
|
68
|
+
"""
|
|
69
|
+
fill_value = series.median()
|
|
70
|
+
return series.fillna(fill_value)
|
|
71
|
+
|
|
72
|
+
def execute_poc_feature_prep(self, df: pd.DataFrame, blueprint_df: pd.DataFrame) -> pd.DataFrame:
|
|
73
|
+
"""Applies basic missing value strategies and builds address strings for geocoding."""
|
|
74
|
+
prep_df = df.copy()
|
|
75
|
+
raw_cols_lower = {col.lower(): col for col in prep_df.columns}
|
|
76
|
+
|
|
77
|
+
# 1. Extract and compile Geo attributes on a per-entity basis
|
|
78
|
+
geo_blueprint = blueprint_df[blueprint_df['is_geographical'] == True]
|
|
79
|
+
entity_geo_groups = geo_blueprint.groupby('provisional_entity')
|
|
80
|
+
|
|
81
|
+
for entity_name, group in entity_geo_groups:
|
|
82
|
+
geo_cols = []
|
|
83
|
+
for _, row in group.iterrows():
|
|
84
|
+
attr_lower = row['attribute_name'].lower()
|
|
85
|
+
if attr_lower in raw_cols_lower:
|
|
86
|
+
geo_cols.append(raw_cols_lower[attr_lower])
|
|
87
|
+
|
|
88
|
+
if geo_cols:
|
|
89
|
+
logger.info(f" -> Consolidating geo attributes for entity: '{entity_name}'")
|
|
90
|
+
prep_df[f"{entity_name.lower()}_geo_search_string"] = prep_df[geo_cols].fillna("").astype(str).agg(", ".join, axis=1)
|
|
91
|
+
|
|
92
|
+
# 2. Variable Strategy Loop driven by Schema Typing definitions
|
|
93
|
+
for _, row in blueprint_df.iterrows():
|
|
94
|
+
attr_lower = row['attribute_name'].lower()
|
|
95
|
+
if attr_lower not in raw_cols_lower:
|
|
96
|
+
continue
|
|
97
|
+
col_name = raw_cols_lower[attr_lower]
|
|
98
|
+
t_type = row['provisional_python_type']
|
|
99
|
+
|
|
100
|
+
# Numeric Strategy: Route directly through decoupled method
|
|
101
|
+
if t_type in ['int', 'float']:
|
|
102
|
+
if prep_df[col_name].isnull().any():
|
|
103
|
+
prep_df[col_name] = self.execute_numeric_imputer(prep_df[col_name])
|
|
104
|
+
|
|
105
|
+
# Categorical Strategy: Explicitly flag missing indices
|
|
106
|
+
elif t_type == 'str':
|
|
107
|
+
prep_df[col_name] = prep_df[col_name].replace(["nan", "None", ""], None).fillna("MISSING")
|
|
108
|
+
|
|
109
|
+
return prep_df
|
|
110
|
+
|
|
111
|
+
def clean_dataset(self):
|
|
112
|
+
blueprint_df = self.verify_and_load_blueprint()
|
|
113
|
+
raw_file = self.config.get('raw_dataset_file', 'sba_loans_raw.csv')
|
|
114
|
+
base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))
|
|
115
|
+
raw_path = os.path.isabs(raw_file) and raw_file or os.path.abspath(os.path.join(base_project_dir, "data", raw_file))
|
|
116
|
+
|
|
117
|
+
data_df = pd.read_csv(raw_path)
|
|
118
|
+
raw_columns_lower = {col.lower(): col for col in data_df.columns}
|
|
119
|
+
|
|
120
|
+
# [STAGE 1] Perform baseline type conversions
|
|
121
|
+
for _, row in blueprint_df.iterrows():
|
|
122
|
+
blueprint_attr = row['attribute_name']
|
|
123
|
+
target_type = row['provisional_python_type']
|
|
124
|
+
attr_lower = blueprint_attr.lower()
|
|
125
|
+
if attr_lower not in raw_columns_lower: continue
|
|
126
|
+
col_name = raw_columns_lower[attr_lower]
|
|
127
|
+
|
|
128
|
+
try:
|
|
129
|
+
if target_type == 'bool':
|
|
130
|
+
if data_df[col_name].dtype == object:
|
|
131
|
+
data_df[col_name] = data_df[col_name].astype(str).str.upper().str.strip().isin(['TRUE', '1', 'Y', 'YES', 'T'])
|
|
132
|
+
else:
|
|
133
|
+
data_df[col_name] = data_df[col_name].fillna(False).astype(bool)
|
|
134
|
+
elif target_type == 'int':
|
|
135
|
+
data_df[col_name] = pd.to_numeric(data_df[col_name], errors='coerce')
|
|
136
|
+
elif target_type == 'float':
|
|
137
|
+
data_df[col_name] = pd.to_numeric(data_df[col_name], errors='coerce')
|
|
138
|
+
elif target_type in ['datetime.date', 'datetime.datetime']:
|
|
139
|
+
data_df[col_name] = pd.to_datetime(data_df[col_name], errors='coerce')
|
|
140
|
+
else:
|
|
141
|
+
data_df[col_name] = data_df[col_name].astype(str).str.strip()
|
|
142
|
+
except Exception: pass
|
|
143
|
+
|
|
144
|
+
# [STAGE 2] Write standard output and generate report
|
|
145
|
+
cleaner_dir = self.config.get('dd_cleaner_output_dir', 'dd_cleaner_results')
|
|
146
|
+
clean_filename = self.config.get('clean_output_filename', 'sba_loans_clean.csv')
|
|
147
|
+
abs_dest_dir = os.path.isabs(cleaner_dir) and cleaner_dir or os.path.abspath(os.path.join(base_project_dir, "data", cleaner_dir))
|
|
148
|
+
os.makedirs(abs_dest_dir, exist_ok=True)
|
|
149
|
+
|
|
150
|
+
data_df.to_csv(os.path.join(abs_dest_dir, clean_filename), index=False)
|
|
151
|
+
self.generate_cleaning_markdown_summary(data_df, base_project_dir)
|
|
152
|
+
|
|
153
|
+
# [STAGE 3] Run PoC feature prep AFTER generating the report
|
|
154
|
+
logger.info("Executing PoC Missing Value Strategies and Geo-string preparation...")
|
|
155
|
+
poc_ready_df = self.execute_poc_feature_prep(data_df, blueprint_df)
|
|
156
|
+
|
|
157
|
+
# Save the finalized feature-selection dataset
|
|
158
|
+
poc_output_path = os.path.join(abs_dest_dir, "feature_selection_ready.csv")
|
|
159
|
+
poc_ready_df.to_csv(poc_output_path, index=False)
|
|
160
|
+
logger.info(f"๐ Hand-off Complete! Modeling matrix saved to: {poc_output_path}")
|
dd_parser/__init__.py
ADDED
dd_parser/cli.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
def main():
    """Console entry point for the data dictionary parser.

    Configures root logging, parses the two required command-line options
    (``--working-dir`` and ``--config``), then builds a
    ``LocalEntityClassifier``, binds the configuration and runs ``process``.
    Exceptions raised by the classifier are not caught here.
    """
    # Root logging is configured first so that the core module, imported
    # further down, inherits the console handler and format.
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
    )

    arg_parser = argparse.ArgumentParser(
        description="Run the private local LLM Data Dictionary Parser."
    )
    required_options = (
        ("--working-dir", "Path to raw source files."),
        ("--config", "Path to config.yaml file."),
    )
    for flag, help_text in required_options:
        arg_parser.add_argument(flag, required=True, help=help_text)
    cli_args = arg_parser.parse_args()

    # Deliberately imported here rather than at module level: the core
    # module is only loaded once logging has been configured.
    from dd_parser.core import LocalEntityClassifier

    entity_classifier = LocalEntityClassifier()
    entity_classifier.set_working_config(
        working_dir=cli_args.working_dir,
        config_path=cli_args.config,
    )
    entity_classifier.process()


if __name__ == "__main__":
    main()
|
dd_parser/core.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import yaml
|
|
4
|
+
import time
|
|
5
|
+
import logging
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from pypdf import PdfReader
|
|
8
|
+
import ollama
|
|
9
|
+
from typing import List, Dict, Any
|
|
10
|
+
from dd_parser.models import AttributeAnalysis, BatchAnalysisResponse
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger("dd_parser")


class LocalEntityClassifier:
    """Classify data-dictionary attributes with a local LLM served by ollama.

    Typical use is ``set_working_config(...)`` followed by ``process()``.
    Attribute names are extracted from CSV, PDF or Markdown files, sent to
    the model in batches, corrected by deterministic naming rules, and
    written as a signed CSV plus a Markdown summary.
    """

    def __init__(self):
        self.working_dir: str = ""
        self.config: Dict[Any, Any] = {}

    def _resolve_output_dir(self) -> str:
        """Return the absolute output directory named by ``dd_parser_output_dir``."""
        raw_output_dir = self.config.get('dd_parser_output_dir', 'dd_analysis_results')
        if os.path.isabs(raw_output_dir):
            return raw_output_dir
        return os.path.abspath(os.path.join(self.working_dir, raw_output_dir))

    def set_working_config(self, working_dir: str, config_path: str):
        """Load the YAML config, bind the working directory and attach a log file.

        Creates the output directory and adds a ``parser_run.log`` file
        handler to the module logger.

        Raises:
            FileNotFoundError: if the config file or the working directory
                does not exist.
            ValueError: if the config file does not contain a YAML mapping.
        """
        abs_config_path = os.path.abspath(config_path)
        if not os.path.exists(abs_config_path):
            raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")

        with open(abs_config_path, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
        # An empty YAML document loads as None; treat it as "all defaults"
        # so the later self.config.get(...) calls keep working.
        if loaded is None:
            loaded = {}
        if not isinstance(loaded, dict):
            raise ValueError(f"Configuration file must contain a YAML mapping: {abs_config_path}")
        self.config = loaded

        if not os.path.isdir(working_dir):
            raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
        self.working_dir = os.path.abspath(working_dir)

        abs_output_dir = self._resolve_output_dir()
        os.makedirs(abs_output_dir, exist_ok=True)

        log_file_path = os.path.join(abs_output_dir, "parser_run.log")
        # The logger is module-level, so calling this method twice would
        # otherwise attach a second handler and duplicate every log line.
        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename == os.path.abspath(log_file_path)
            for handler in logger.handlers
        )
        if not already_attached:
            file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
            file_handler.setFormatter(
                logging.Formatter(
                    "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                )
            )
            logger.addHandler(file_handler)

        logger.info("Context Initialized with Hybrid Processing Configuration.")
        logger.info(f"Loaded Config: {abs_config_path} | Tracking Log: {log_file_path}")

    def extract_attributes(self, file_path: str, csv_idx: int = 0) -> List[str]:
        """Return the candidate attribute strings found in *file_path*.

        ``.csv`` files contribute the non-null values of column *csv_idx*;
        ``.pdf``, ``.md`` and ``.markdown`` files contribute every non-blank
        line, stripped.

        Raises:
            ValueError: for any other file extension.
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()
        if ext == '.csv':
            return pd.read_csv(file_path).iloc[:, csv_idx].dropna().astype(str).tolist()
        if ext == '.pdf':
            lines: List[str] = []
            for page in PdfReader(file_path).pages:
                # Guard against a page that yields no text (assumed possible
                # for image-only pages — TODO confirm against pypdf).
                page_text = page.extract_text() or ""
                lines.extend(text.strip() for text in page_text.split('\n') if text.strip())
            return lines
        if ext in ('.md', '.markdown'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        raise ValueError(f"Unsupported format: {ext}")

    def analyze_batch(self, attributes: List[str]) -> List[AttributeAnalysis]:
        """Ask the configured ollama model to classify one batch of attributes.

        The request constrains the reply to the JSON schema of
        ``BatchAnalysisResponse``; the reply is parsed and its ``analysis``
        list returned. Model name, system prompt and temperature come from
        the config keys ``model_name``, ``system_prompt`` and ``temperature``.
        """
        # Doubled braces are literal braces in the f-string; only the final
        # json.dumps(...) is interpolated.
        prompt = f"""
        Analyze the following data dictionary attributes.

        ### EXAMPLES OF EXCELLENT PERFORMANCE
        Input: ["BorrCity", "BankStreet", "GrossApproval", "SoldSecMrktInd"]
        Output Schema Map:
        {{
            "analysis": [
                {{"attribute_name": "BorrCity", "provisional_entity": "Borrower", "is_geographical": true, "related_entity": "Borrower", "provisional_python_type": "str"}},
                {{"attribute_name": "BankStreet", "provisional_entity": "Bank", "is_geographical": true, "related_entity": "Bank", "provisional_python_type": "str"}},
                {{"attribute_name": "GrossApproval", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "float"}},
                {{"attribute_name": "SoldSecMrktInd", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "bool"}}
            ]
        }}

        ### CURRENT EXECUTION BATCH
        Attributes to process: {json.dumps(attributes)}
        """

        response = ollama.chat(
            model=self.config.get('model_name', 'llama3.2'),
            messages=[
                {"role": "system", "content": self.config.get('system_prompt', 'You are a precise data engineering assistant. Respond strictly in JSON.')},
                {"role": "user", "content": prompt},
            ],
            options={"temperature": self.config.get('temperature', 0.0)},
            format=BatchAnalysisResponse.model_json_schema(),
        )
        payload = json.loads(response['message']['content'])
        return BatchAnalysisResponse(**payload).analysis

    def post_process_cleaner(self, analysis_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Apply deterministic naming rules on top of the model's answers.

        Each input dict is copied, so the caller's records are not mutated.
        Rules, in order: the entity is forced from the attribute-name prefix
        or keyword (Borr/Bank/Project/Approval/Disbursement/Program/SBA);
        names ending in ``Ind`` or containing ``Indicator`` become ``bool``;
        non-geographical records get an empty ``related_entity``.
        """
        cleaned_records = []
        for item in analysis_list:
            data = item.copy()
            attr = data['attribute_name']

            if attr.startswith('Borr'):
                data['provisional_entity'] = 'Borrower'
                if data['is_geographical']:
                    data['related_entity'] = 'Borrower'
            elif attr.startswith('Bank'):
                data['provisional_entity'] = 'Bank'
                if attr in ('BankStreet', 'BankCity', 'BankState', 'BankZip'):
                    data['is_geographical'] = True
                    data['related_entity'] = 'Bank'
            elif attr.startswith('Project'):
                data['provisional_entity'] = 'Project'
                if attr in ('ProjectCounty', 'ProjectState'):
                    data['is_geographical'] = True
                    data['related_entity'] = 'Project'
            elif 'Approval' in attr or 'Disbursement' in attr or attr in ('Program', 'Subprogram'):
                data['provisional_entity'] = 'Loan'
            elif attr.startswith('SBA'):
                data['provisional_entity'] = 'SBA'
                if data['is_geographical']:
                    data['related_entity'] = 'SBA'

            if attr.endswith('Ind') or 'Indicator' in attr:
                data['provisional_python_type'] = 'bool'

            if not data['is_geographical']:
                data['related_entity'] = ""

            cleaned_records.append(data)
        return cleaned_records

    def generate_parsing_markdown_summary(self, final_results: List[Dict[str, Any]], base_project_dir: str):
        """Write ``dd_parsing_summary.md`` describing the classified attributes.

        The report goes into ``<base_project_dir>/<documents_dir>`` where
        ``documents_dir`` is a config key defaulting to ``documents``. It
        lists attribute counts per entity, then the attributes split into
        bool, numeric and all remaining types (grouped by entity).
        *final_results* must be non-empty: the entity column is read directly.
        """
        df = pd.DataFrame(final_results)
        doc_dir_name = self.config.get('documents_dir', 'documents')
        abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
        os.makedirs(abs_doc_dir, exist_ok=True)

        report_path = os.path.join(abs_doc_dir, "dd_parsing_summary.md")
        entity_counts = df['provisional_entity'].value_counts()
        total_attributes = len(df)

        type_column = df['provisional_python_type']
        categorical_df = df[type_column == 'bool']
        numerical_df = df[type_column.isin(['int', 'float'])]
        semantic_df = df[~type_column.isin(['bool', 'int', 'float'])]

        def write_flat_section(handle, heading, section_df):
            handle.write(heading)
            for _, row in section_df.iterrows():
                # The backslash is doubled on purpose: a bare "\r" in a
                # normal string literal is a carriage return, which would
                # emit "$<CR>ightarrow$" instead of the LaTeX arrow.
                handle.write(
                    f"- `{row['attribute_name']}` ({row['provisional_python_type']}) "
                    f"$\\rightarrow$ Node: **{row['provisional_entity']}**\n"
                )
            handle.write("\n")

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# ๐ KMDS Data Helper: Data Dictionary Parsing Summary\n\n")
            # Two trailing spaces force a Markdown line break between the
            # two bold lines.
            f.write(f"**Total Tracked Attributes:** {total_attributes}  \n")
            f.write(f"**Unique Detected Entities:** {len(entity_counts)}\n\n")

            f.write("## ๐๏ธ Entity Size & Distribution Profile\n")
            f.write("| Detected Entity Node | Number of Attributes (Size) |\n")
            f.write("| :--- | :--- |\n")
            for ent, count in entity_counts.items():
                f.write(f"| {ent} | {count} |\n")
            f.write("\n")

            f.write("## ๐๏ธ Attribute Structural Categories\n")
            write_flat_section(f, f"### ๐ Categorical Fields (Total: {len(categorical_df)})\n", categorical_df)
            write_flat_section(f, f"### ๐ข Numerical Fields (Total: {len(numerical_df)})\n", numerical_df)

            f.write(f"### ๐ง Semantic Attributes Grouped By Parent Class (Total: {len(semantic_df)})\n")
            for ent_group, group_df in semantic_df.groupby('provisional_entity'):
                f.write(f"#### Entity Category: `{ent_group}`\n")
                for _, row in group_df.iterrows():
                    geo_suffix = f" [GEO Linked: {row['related_entity']}]" if row['is_geographical'] else ""
                    f.write(f" - `{row['attribute_name']}` ({row['provisional_python_type']}){geo_suffix}\n")
                f.write("\n")

        logger.info(f"Generated parser metadata documentation report saved to: {report_path}")

    def process(self):
        """Classify every file listed under config key ``files``.

        For each existing file: extract attributes, analyse them in batches
        of ``batch_size``, apply the post-processing rules, then write a CSV
        whose first line is the ``# DD-PARSER-SIGNATURE`` preamble, followed
        by the Markdown summary. A failing batch is logged and skipped; the
        remaining batches still run. The configured ``output_filename`` is
        only honoured when exactly one file is listed; otherwise each output
        is named ``mapped_<input name>.csv``.
        """
        files_to_process = self.config.get('files', [])
        if not files_to_process:
            logger.warning("No input files listed under 'files' in the config; nothing to process.")
            return

        abs_output_dir = self._resolve_output_dir()
        os.makedirs(abs_output_dir, exist_ok=True)

        batch_size = self.config.get('batch_size', 10)
        if not isinstance(batch_size, int) or batch_size < 1:
            # range() below needs a positive integer step.
            logger.warning(f"Invalid batch_size {batch_size!r}; falling back to 10.")
            batch_size = 10
        csv_col_idx = self.config.get('csv_target_column_index', 0)
        config_filename = self.config.get('output_filename')
        model_tag = str(self.config.get('model_name', 'llama3.2')).upper()

        # Extract base project root for kmds-data-helper structure alignment
        base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))

        for filepath in files_to_process:
            if os.path.isabs(filepath):
                input_file_path = filepath
            else:
                input_file_path = os.path.abspath(os.path.join(self.working_dir, filepath))
            if not os.path.exists(input_file_path):
                logger.warning(f"Skipping missing input file: {input_file_path}")
                continue

            filename = os.path.basename(input_file_path)
            try:
                raw_attributes = self.extract_attributes(input_file_path, csv_col_idx)
                logger.info(f"Extracted {len(raw_attributes)} attributes from {filename}")
            except Exception as e:
                logger.exception(f"Read failure on {filename}: {e}")
                continue

            final_results = []
            for i in range(0, len(raw_attributes), batch_size):
                batch = raw_attributes[i:i + batch_size]
                start_time = time.perf_counter()
                try:
                    batch_output = self.analyze_batch(batch)
                    batch_dicts = [item.model_dump() for item in batch_output]
                    final_results.extend(self.post_process_cleaner(batch_dicts))
                    logger.info(f" Batch run {i} verified in {time.perf_counter() - start_time:.2f}s")
                except Exception as e:
                    logger.error(f" Batch crash at element index {i}: {e}")

            if not final_results:
                logger.warning(f"No attributes were classified for {filename}; no output written.")
                continue

            if config_filename and len(files_to_process) == 1:
                out_filename = config_filename
            else:
                base_name, _ = os.path.splitext(filename)
                out_filename = f"mapped_{base_name}.csv"

            output_csv_path = os.path.join(abs_output_dir, out_filename)

            # The signature line is written first; the table is then appended
            # below it.
            preamble = f"# DD-PARSER-SIGNATURE: PROCESSED-BY-{model_tag}\n"
            with open(output_csv_path, 'w', encoding='utf-8') as f:
                f.write(preamble)

            pd.DataFrame(final_results).to_csv(output_csv_path, mode='a', index=False)
            logger.info(f"Finished tracking {filename}. Saved verified matrix map to: {output_csv_path}")

            # Dynamic Markdown Generation Task Trigger
            self.generate_parsing_markdown_summary(final_results, base_project_dir)
|
dd_parser/models.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
from typing import List, Optional, Literal
|
|
3
|
+
|
|
4
|
+
class AttributeAnalysis(BaseModel):
    # One classified data-dictionary attribute: the entity it belongs to,
    # whether it is a location, and the Python type it should be cast to.
    #
    # NOTE(review): documented with comments rather than a docstring on
    # purpose — pydantic is understood to copy a model docstring into the
    # generated JSON schema, which would change the schema text. Confirm
    # before converting this to a docstring.

    # Attribute name exactly as it appears in the data dictionary.
    attribute_name: str

    provisional_entity: str = Field(
        description="The primary business entity this attribute belongs to (e.g., Customer, Product, Transaction).",
    )

    is_geographical: bool = Field(
        description="True if the attribute represents a physical location, address, coordinate, country, or region.",
    )

    # Optional: defaults to None when no entity is supplied.
    related_entity: Optional[str] = Field(
        default=None,
        description="If geographical, which entity does this location bind to? (e.g., 'Customer' for 'shipping_state').",
    )

    # Closed set of type names; any other value fails validation.
    provisional_python_type: Literal[
        "str",
        "int",
        "float",
        "datetime.date",
        "datetime.datetime",
        "bool",
    ] = Field(
        description="Semantic Python data type based on semantics.",
    )
|
|
19
|
+
|
|
20
|
+
class BatchAnalysisResponse(BaseModel):
    # Envelope for one batch reply: a single "analysis" key holding one
    # AttributeAnalysis record per classified attribute.
    # (Comments rather than a docstring, so the generated schema text is
    # left untouched.)
    analysis: List[AttributeAnalysis]
|
dd_parser/py.typed
ADDED
|
File without changes
|
dd_parser_cleaner-0.1.0.dist-info/METADATA
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dd-parser-cleaner
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A private, local LLM-powered data dictionary parser and entity mapper with automated cleaning.
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: ollama>=0.2.0
|
|
7
|
+
Requires-Dist: pandas>=2.2.0
|
|
8
|
+
Requires-Dist: pydantic>=2.6.0
|
|
9
|
+
Requires-Dist: pypdf>=4.1.0
|
|
10
|
+
Requires-Dist: pyyaml>=6.0.1
|
dd_parser_cleaner-0.1.0.dist-info/RECORD
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
dd_cleaner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
dd_cleaner/cli.py,sha256=rDDLQWNm-JLbL2i4teTPqya1Cg4VOtKCsGzGZARZwTU,1253
|
|
3
|
+
dd_cleaner/engine.py,sha256=JAfuW36Os63UOLnWTouexe01jiE8sTCSNrlgy5JqrT0,8523
|
|
4
|
+
dd_parser/__init__.py,sha256=hEBM_BAfswjX_DjyzGSKs1pbP-08MI1lpVd_xFjT31o,194
|
|
5
|
+
dd_parser/cli.py,sha256=Bt2-u23moNhoHkhtoxlSgqEQyLLgTlg5bMlsvQZSF3g,880
|
|
6
|
+
dd_parser/core.py,sha256=WAoPrxSCisZkI7Q56PlbgyHoyXwXugRussHsfyHL7BE,11826
|
|
7
|
+
dd_parser/models.py,sha256=XVFU3VH91bhFw_LT6-4hoI0RPHlaYWCP8phEiIccy-Y,912
|
|
8
|
+
dd_parser/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
dd_parser_cleaner-0.1.0.dist-info/METADATA,sha256=kIZAgeaBt8MqeAK-eRVZMVdHr7kNlLCiFd597HtY19I,335
|
|
10
|
+
dd_parser_cleaner-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
11
|
+
dd_parser_cleaner-0.1.0.dist-info/entry_points.txt,sha256=GSGie27i-zUte6BbiBF8OBUGCjVhTjkwjD4rnf_KmtU,93
|
|
12
|
+
dd_parser_cleaner-0.1.0.dist-info/RECORD,,
|