dd-parser-cleaner 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dd_cleaner/__init__.py ADDED
File without changes
dd_cleaner/cli.py ADDED
@@ -0,0 +1,35 @@
1
+ import argparse
2
+ import logging
3
+ import sys
4
+
5
def main(argv=None):
    """Command-line entry point for the data cleaner pipeline.

    Configures console logging, parses ``--working-dir`` and ``--config``,
    then builds a ``DataCleanerEngine`` and runs ``clean_dataset()``.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default,
            and what the ``clean-dataset`` console script uses) argparse
            reads ``sys.argv[1:]``.

    Raises:
        SystemExit: with status 2 when argument parsing fails (argparse), or
            status 1 when the pipeline raises any exception.
    """
    # 1. Initialize clean, standardized console log streaming
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    parser = argparse.ArgumentParser(
        description="Run the data cleaner pipeline driven by the verified dd-parser blueprint matrix map."
    )
    parser.add_argument("--working-dir", required=True, help="Path to raw data directories containing target payloads.")
    parser.add_argument("--config", required=True, help="Path to config.yaml file containing script targets.")
    args = parser.parse_args(argv)

    # Import engine inside main to allow logging configs to bind cleanly first
    from dd_cleaner.engine import DataCleanerEngine

    try:
        # 2. Fire up the pipeline runner
        cleaner = DataCleanerEngine()
        cleaner.set_working_config(working_dir=args.working_dir, config_path=args.config)
        cleaner.clean_dataset()
    except Exception as e:
        logger = logging.getLogger("dd_cleaner_main")
        logger.error(f"🛑 Critical Pipeline Failure: {e}")
        # Keep the console output short at INFO, but do not throw the
        # traceback away: it is emitted when the log level is DEBUG.
        logger.debug("Traceback for the pipeline failure above:", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    main()
dd_cleaner/engine.py ADDED
@@ -0,0 +1,160 @@
1
+ import os
2
+ import logging
3
+ import yaml
4
+ import pandas as pd
5
+ from typing import Dict, Any
6
+
7
logger = logging.getLogger("dd_cleaner")


class DataCleanerEngine:
    """Cleans a raw CSV dataset using the blueprint CSV written by dd-parser.

    Typical use: call :meth:`set_working_config` once, then
    :meth:`clean_dataset`. All output locations are derived from the loaded
    config and from the parent directory of ``working_dir``.
    """

    # Upper-cased string tokens that count as True during bool conversion.
    _TRUTHY_TOKENS = ('TRUE', '1', 'Y', 'YES', 'T')
    # String placeholders treated as missing values in 'str' columns.
    _MISSING_TOKENS = ("nan", "None", "")

    def __init__(self):
        self.working_dir: str = ""
        self.config: Dict[Any, Any] = {}

    def set_working_config(self, working_dir: str, config_path: str):
        """Loads the shared config and binds the root working directory context.

        Args:
            working_dir: Existing directory; stored as an absolute path.
            config_path: Path to the YAML configuration file.

        Raises:
            FileNotFoundError: if the config file or the directory is missing.
        """
        abs_config_path = os.path.abspath(config_path)
        if not os.path.exists(abs_config_path):
            raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")
        with open(abs_config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; fall back to an
            # empty mapping so the later self.config.get(...) calls work.
            self.config = yaml.safe_load(f) or {}
        if not os.path.isdir(working_dir):
            raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
        self.working_dir = os.path.abspath(working_dir)
        logger.info("Cleaner Context Initialized.")

    @staticmethod
    def _match_column(attribute_name: Any, columns_by_lower: Dict[str, str]):
        """Returns the dataset column matching the attribute case-insensitively, else None."""
        # Blank blueprint cells are read back as NaN (a float), not a string.
        if not isinstance(attribute_name, str):
            return None
        return columns_by_lower.get(attribute_name.lower())

    def verify_and_load_blueprint(self) -> pd.DataFrame:
        """Handshakes with the dd_parser output subdirectory to grab the metadata map.

        Returns:
            The blueprint table, read from the file below its signature line.

        Raises:
            FileNotFoundError: if the blueprint file does not exist.
            ValueError: if the first line is not a dd-parser signature.
        """
        parser_dir = self.config.get('dd_parser_output_dir', 'dd_analysis_results')
        blueprint_name = self.config.get('output_filename', 'sba_analysis_results.csv')
        if os.path.isabs(parser_dir):
            blueprint_path = os.path.join(parser_dir, blueprint_name)
        else:
            blueprint_path = os.path.abspath(os.path.join(self.working_dir, parser_dir, blueprint_name))

        if not os.path.exists(blueprint_path):
            raise FileNotFoundError(f"Missing parsing matrix blueprint file at: {blueprint_path}")

        with open(blueprint_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
        if not first_line.startswith("# DD-PARSER-SIGNATURE"):
            raise ValueError(f"Rejected: File at {blueprint_path} does not originate from dd-parser pipeline!")

        # Only the first line is a signature. Skipping exactly that line
        # (instead of comment='#') keeps values that contain '#' intact.
        return pd.read_csv(blueprint_path, skiprows=1)

    def generate_cleaning_markdown_summary(self, data_df: pd.DataFrame, base_project_dir: str):
        """Compiles clean data type breakdowns and null metrics to the documents/ workspace.

        Writes ``data_cleaning_summary.md`` into the configured documents
        directory (default ``documents``) under ``base_project_dir``.

        Args:
            data_df: The dataset to profile.
            base_project_dir: Directory under which the documents folder lives.
        """
        doc_dir_name = self.config.get('documents_dir', 'documents')
        abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
        os.makedirs(abs_doc_dir, exist_ok=True)

        report_path = os.path.join(abs_doc_dir, "data_cleaning_summary.md")
        type_summary = data_df.dtypes.value_counts()
        null_counts = data_df.isnull().sum()
        total_rows = len(data_df)

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# 🧼 KMDS Data Helper: Data Cleaning Summary Report\n\n")
            f.write("## 📊 Converted Data Types Summary\n")
            # Header and separator rows are required for the rows below to
            # render as a Markdown table.
            f.write("| Data Type | Column Count |\n")
            f.write("| :--- | :--- |\n")
            for dtype_name, count in type_summary.items():
                f.write(f"| {dtype_name} | {count} |\n")
            f.write("\n## 🗃️ Missing Value Counts\n")
            f.write("| Column | Missing Values | Completeness |\n")
            f.write("| :--- | :--- | :--- |\n")
            for col, missing in null_counts.items():
                missing = int(missing)
                if total_rows:
                    completeness = f"{(total_rows - missing) / total_rows * 100:.2f}%"
                else:
                    # An empty dataset has no meaningful percentage.
                    completeness = "n/a"
                f.write(f"| `{col}` | {missing} | {completeness} |\n")

        logger.info(f"Generated clean dataset validation profile saved to: {report_path}")

    def execute_numeric_imputer(self, series: pd.Series) -> pd.Series:
        """
        Isolated strategy vector for missing continuous numerical data items.
        Swap out this inner logic to upgrade from median to kNN/Iterative models later.

        Returns a new series with missing values replaced by the median.
        """
        fill_value = series.median()
        return series.fillna(fill_value)

    def execute_poc_feature_prep(self, df: pd.DataFrame, blueprint_df: pd.DataFrame) -> pd.DataFrame:
        """Applies basic missing value strategies and builds address strings for geocoding.

        Args:
            df: The converted dataset; it is copied, not modified.
            blueprint_df: Blueprint with the columns ``attribute_name``,
                ``provisional_entity``, ``is_geographical`` and
                ``provisional_python_type``.

        Returns:
            A copy of ``df`` with one ``<entity>_geo_search_string`` column per
            entity that has matching geographical columns, numeric gaps filled
            by :meth:`execute_numeric_imputer`, and missing ``str`` values set
            to ``"MISSING"``.
        """
        prep_df = df.copy()
        raw_cols_lower = {col.lower(): col for col in prep_df.columns}

        # 1. Extract and compile Geo attributes on a per-entity basis
        geo_blueprint = blueprint_df[blueprint_df['is_geographical'] == True]  # noqa: E712
        for entity_name, group in geo_blueprint.groupby('provisional_entity'):
            geo_cols = []
            for attribute in group['attribute_name']:
                matched = self._match_column(attribute, raw_cols_lower)
                if matched is not None:
                    geo_cols.append(matched)

            if geo_cols:
                logger.info(f" -> Consolidating geo attributes for entity: '{entity_name}'")
                prep_df[f"{str(entity_name).lower()}_geo_search_string"] = (
                    prep_df[geo_cols].fillna("").astype(str).agg(", ".join, axis=1)
                )

        # 2. Variable Strategy Loop driven by Schema Typing definitions
        for attribute, t_type in zip(blueprint_df['attribute_name'], blueprint_df['provisional_python_type']):
            col_name = self._match_column(attribute, raw_cols_lower)
            if col_name is None:
                continue

            # Numeric Strategy: Route directly through decoupled method
            if t_type in ('int', 'float'):
                if prep_df[col_name].isnull().any():
                    prep_df[col_name] = self.execute_numeric_imputer(prep_df[col_name])

            # Categorical Strategy: Explicitly flag missing indices
            elif t_type == 'str':
                column = prep_df[col_name]
                # Placeholder strings become missing, then every missing
                # value is flagged explicitly.
                placeholders = column.isin(list(self._MISSING_TOKENS))
                prep_df[col_name] = column.mask(placeholders).fillna("MISSING")

        return prep_df

    @classmethod
    def _convert_column(cls, series: pd.Series, target_type: Any) -> pd.Series:
        """Returns the series converted to the blueprint's provisional type."""
        if target_type == 'bool':
            if pd.api.types.is_numeric_dtype(series):
                return series.fillna(False).astype(bool)
            # Text columns (object or pandas string dtype): compare tokens.
            return series.astype(str).str.upper().str.strip().isin(list(cls._TRUTHY_TOKENS))
        if target_type in ('int', 'float'):
            return pd.to_numeric(series, errors='coerce')
        if target_type in ('datetime.date', 'datetime.datetime'):
            return pd.to_datetime(series, errors='coerce')
        # NOTE: astype(str) can turn missing values into the literal "nan";
        # execute_poc_feature_prep maps that placeholder to "MISSING".
        return series.astype(str).str.strip()

    def clean_dataset(self):
        """Runs the full pipeline: convert types, write outputs, prepare features.

        Reads the raw CSV (config ``raw_dataset_file``, resolved under
        ``<working_dir>/../data`` unless absolute), converts every column the
        blueprint names, then writes the clean CSV, the Markdown summary and
        ``feature_selection_ready.csv``.
        """
        blueprint_df = self.verify_and_load_blueprint()
        raw_file = self.config.get('raw_dataset_file', 'sba_loans_raw.csv')
        base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))
        if os.path.isabs(raw_file):
            raw_path = raw_file
        else:
            raw_path = os.path.abspath(os.path.join(base_project_dir, "data", raw_file))

        data_df = pd.read_csv(raw_path)
        raw_columns_lower = {col.lower(): col for col in data_df.columns}

        # [STAGE 1] Perform baseline type conversions
        for attribute, target_type in zip(blueprint_df['attribute_name'], blueprint_df['provisional_python_type']):
            col_name = self._match_column(attribute, raw_columns_lower)
            if col_name is None:
                continue
            try:
                data_df[col_name] = self._convert_column(data_df[col_name], target_type)
            except Exception as exc:
                # Best effort: keep the column as read, but say so.
                logger.warning(f"Could not convert column '{col_name}' to {target_type}: {exc}")

        # [STAGE 2] Write standard output and generate report
        cleaner_dir = self.config.get('dd_cleaner_output_dir', 'dd_cleaner_results')
        clean_filename = self.config.get('clean_output_filename', 'sba_loans_clean.csv')
        if os.path.isabs(cleaner_dir):
            abs_dest_dir = cleaner_dir
        else:
            abs_dest_dir = os.path.abspath(os.path.join(base_project_dir, "data", cleaner_dir))
        os.makedirs(abs_dest_dir, exist_ok=True)

        data_df.to_csv(os.path.join(abs_dest_dir, clean_filename), index=False)
        self.generate_cleaning_markdown_summary(data_df, base_project_dir)

        # [STAGE 3] Run PoC feature prep AFTER generating the report
        logger.info("Executing PoC Missing Value Strategies and Geo-string preparation...")
        poc_ready_df = self.execute_poc_feature_prep(data_df, blueprint_df)

        # Save the finalized feature-selection dataset
        poc_output_path = os.path.join(abs_dest_dir, "feature_selection_ready.csv")
        poc_ready_df.to_csv(poc_output_path, index=False)
        logger.info(f"🎉 Hand-off Complete! Modeling matrix saved to: {poc_output_path}")
dd_parser/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # src/dd_parser/__init__.py
2
+
3
+ from dd_parser.core import LocalEntityClassifier
4
+ from dd_parser.models import AttributeAnalysis
5
+
6
+ __all__ = [
7
+ "LocalEntityClassifier",
8
+ "AttributeAnalysis",
9
+ ]
dd_parser/cli.py ADDED
@@ -0,0 +1,25 @@
1
+ import argparse
2
+ import logging
3
+
4
def main(argv=None):
    """Command-line entry point for the data dictionary parser.

    Configures console logging, parses ``--working-dir`` and ``--config``,
    then builds a ``LocalEntityClassifier`` and runs ``process()``.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default,
            and what the ``classify-entities`` console script uses) argparse
            reads ``sys.argv[1:]``.

    Raises:
        SystemExit: with status 2 when argument parsing fails (argparse), or
            status 1 when the classifier raises any exception.
    """
    # Setup initial clean console output streaming
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    parser = argparse.ArgumentParser(description="Run the private local LLM Data Dictionary Parser.")
    parser.add_argument("--working-dir", required=True, help="Path to raw source files.")
    parser.add_argument("--config", required=True, help="Path to config.yaml file.")
    args = parser.parse_args(argv)

    # Import inside main to let logging configure cleanly first
    from dd_parser.core import LocalEntityClassifier

    try:
        classifier = LocalEntityClassifier()
        classifier.set_working_config(working_dir=args.working_dir, config_path=args.config)
        classifier.process()
    except Exception as e:
        # Same contract as the cleaner CLI: report through logging and exit
        # non-zero. logger.exception keeps the traceback visible.
        logging.getLogger("dd_parser_main").exception(f"Critical Parser Failure: {e}")
        raise SystemExit(1)


if __name__ == "__main__":
    main()
dd_parser/core.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ import json
3
+ import yaml
4
+ import time
5
+ import logging
6
+ import pandas as pd
7
+ from pypdf import PdfReader
8
+ import ollama
9
+ from typing import List, Dict, Any
10
+ from dd_parser.models import AttributeAnalysis, BatchAnalysisResponse
11
+
12
logger = logging.getLogger("dd_parser")


class LocalEntityClassifier:
    """Classifies data dictionary attributes with a local LLM served by ollama.

    Typical use: call :meth:`set_working_config` once, then :meth:`process`.
    For every configured input file the attributes are extracted, analysed in
    batches, corrected by rule-based post-processing and written to a signed
    CSV that the cleaner accepts as its blueprint.
    """

    def __init__(self):
        self.working_dir: str = ""
        self.config: Dict[Any, Any] = {}

    def _resolve_path(self, path: str) -> str:
        """Returns the path unchanged if absolute, else resolved under the working directory."""
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(self.working_dir, path))

    def set_working_config(self, working_dir: str, config_path: str):
        """Loads the YAML config, binds the working directory and attaches a file log.

        Args:
            working_dir: Existing directory; stored as an absolute path.
            config_path: Path to the YAML configuration file.

        Raises:
            FileNotFoundError: if the config file or the directory is missing.
        """
        abs_config_path = os.path.abspath(config_path)
        if not os.path.exists(abs_config_path):
            raise FileNotFoundError(f"Configuration file not found at: {abs_config_path}")

        with open(abs_config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; fall back to an
            # empty mapping so the later self.config.get(...) calls work.
            self.config = yaml.safe_load(f) or {}

        if not os.path.isdir(working_dir):
            raise FileNotFoundError(f"Target data directory not found: {os.path.abspath(working_dir)}")
        self.working_dir = os.path.abspath(working_dir)

        abs_output_dir = self._resolve_path(self.config.get('dd_parser_output_dir', 'dd_analysis_results'))
        os.makedirs(abs_output_dir, exist_ok=True)

        log_file_path = os.path.join(abs_output_dir, "parser_run.log")
        # The logger is module-level: attach the file handler only once per
        # log file, otherwise repeated calls duplicate every record.
        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename == os.path.abspath(log_file_path)
            for handler in logger.handlers
        )
        if not already_attached:
            file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
            file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"))
            logger.addHandler(file_handler)

        logger.info("Context Initialized with Hybrid Processing Configuration.")
        logger.info(f"Loaded Config: {abs_config_path} | Tracking Log: {log_file_path}")

    def extract_attributes(self, file_path: str, csv_idx: int = 0) -> List[str]:
        """Extracts the raw attribute strings from a data dictionary file.

        Args:
            file_path: A ``.csv``, ``.pdf``, ``.md`` or ``.markdown`` file.
            csv_idx: Positional index of the CSV column to read (CSV only).

        Returns:
            For CSV, the non-null values of the chosen column as strings; for
            PDF and Markdown, every non-blank line, stripped.

        Raises:
            ValueError: if the file extension is not supported.
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()
        if ext == '.csv':
            return pd.read_csv(file_path).iloc[:, csv_idx].dropna().astype(str).tolist()
        if ext == '.pdf':
            return [
                line.strip()
                for page in PdfReader(file_path).pages
                # Guard against a page that yields no text at all.
                for line in (page.extract_text() or "").split('\n')
                if line.strip()
            ]
        if ext in ('.md', '.markdown'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        raise ValueError(f"Unsupported format: {ext}")

    def analyze_batch(self, attributes: List[str]) -> "List[AttributeAnalysis]":
        """Sends one batch of attribute names to the model and parses the reply.

        The request passes the ``BatchAnalysisResponse`` JSON schema as the
        response format; the reply content is parsed as JSON and validated
        through ``BatchAnalysisResponse``.

        Returns:
            The ``analysis`` list of the validated response.
        """
        prompt = f"""
        Analyze the following data dictionary attributes.

        ### EXAMPLES OF EXCELLENT PERFORMANCE
        Input: ["BorrCity", "BankStreet", "GrossApproval", "SoldSecMrktInd"]
        Output Schema Map:
        {{
        "analysis": [
        {{"attribute_name": "BorrCity", "provisional_entity": "Borrower", "is_geographical": true, "related_entity": "Borrower", "provisional_python_type": "str"}},
        {{"attribute_name": "BankStreet", "provisional_entity": "Bank", "is_geographical": true, "related_entity": "Bank", "provisional_python_type": "str"}},
        {{"attribute_name": "GrossApproval", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "float"}},
        {{"attribute_name": "SoldSecMrktInd", "provisional_entity": "Loan", "is_geographical": false, "related_entity": null, "provisional_python_type": "bool"}}
        ]
        }}

        ### CURRENT EXECUTION BATCH
        Attributes to process: {json.dumps(attributes)}
        """

        response = ollama.chat(
            model=self.config.get('model_name', 'llama3.2'),
            messages=[
                {"role": "system", "content": self.config.get('system_prompt', 'You are a precise data engineering assistant. Respond strictly in JSON.')},
                {"role": "user", "content": prompt}
            ],
            options={"temperature": self.config.get('temperature', 0.0)},
            format=BatchAnalysisResponse.model_json_schema()
        )
        return BatchAnalysisResponse(**json.loads(response['message']['content'])).analysis

    def post_process_cleaner(self, analysis_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Applies rule-based corrections on top of the model's classification.

        Entity and geography are overridden from attribute-name prefixes and
        keywords, ``...Ind`` / ``...Indicator...`` attributes are typed as
        ``bool``, and ``related_entity`` is blanked for non-geographical
        attributes.

        Args:
            analysis_list: Records with the keys ``attribute_name``,
                ``provisional_entity``, ``is_geographical``,
                ``related_entity`` and ``provisional_python_type``.

        Returns:
            New, corrected records; the input dicts are not modified.
        """
        cleaned_records = []
        for item in analysis_list:
            data = item.copy()
            attr = data['attribute_name']

            if attr.startswith('Borr'):
                data['provisional_entity'] = 'Borrower'
                if data['is_geographical']:
                    data['related_entity'] = 'Borrower'
            elif attr.startswith('Bank'):
                data['provisional_entity'] = 'Bank'
                if attr in ('BankStreet', 'BankCity', 'BankState', 'BankZip'):
                    data['is_geographical'] = True
                    data['related_entity'] = 'Bank'
            elif attr.startswith('Project'):
                data['provisional_entity'] = 'Project'
                if attr in ('ProjectCounty', 'ProjectState'):
                    data['is_geographical'] = True
                    data['related_entity'] = 'Project'
            elif 'Approval' in attr or 'Disbursement' in attr or attr in ('Program', 'Subprogram'):
                data['provisional_entity'] = 'Loan'
            elif attr.startswith('SBA'):
                data['provisional_entity'] = 'SBA'
                if data['is_geographical']:
                    data['related_entity'] = 'SBA'

            if attr.endswith('Ind') or 'Indicator' in attr:
                data['provisional_python_type'] = 'bool'

            # A related entity is only kept for geographical attributes.
            if not data['is_geographical']:
                data['related_entity'] = ""

            cleaned_records.append(data)
        return cleaned_records

    def generate_parsing_markdown_summary(self, final_results: List[Dict[str, Any]], base_project_dir: str):
        """Compiles structural data profiles and metadata metrics as a clean Markdown specification file.

        Writes ``dd_parsing_summary.md`` into the configured documents
        directory (default ``documents``) under ``base_project_dir``.

        Args:
            final_results: Post-processed records, as returned by
                :meth:`post_process_cleaner`.
            base_project_dir: Directory under which the documents folder lives.
        """
        df = pd.DataFrame(final_results)
        doc_dir_name = self.config.get('documents_dir', 'documents')
        abs_doc_dir = os.path.abspath(os.path.join(base_project_dir, doc_dir_name))
        os.makedirs(abs_doc_dir, exist_ok=True)

        report_path = os.path.join(abs_doc_dir, "dd_parsing_summary.md")
        entity_counts = df['provisional_entity'].value_counts()
        total_attributes = len(df)

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# 📑 KMDS Data Helper: Data Dictionary Parsing Summary\n\n")
            # Two trailing spaces force a Markdown line break between the lines.
            f.write(f"**Total Tracked Attributes:** {total_attributes}  \n")
            f.write(f"**Unique Detected Entities:** {len(entity_counts)}\n\n")

            f.write("## 🏗️ Entity Size & Distribution Profile\n")
            f.write("| Detected Entity Node | Number of Attributes (Size) |\n")
            f.write("| :--- | :--- |\n")
            for ent, count in entity_counts.items():
                f.write(f"| {ent} | {count} |\n")
            f.write("\n")

            f.write("## 🎛️ Attribute Structural Categories\n")
            categorical_df = df[df['provisional_python_type'] == 'bool']
            numerical_df = df[df['provisional_python_type'].isin(['int', 'float'])]
            semantic_df = df[~df['provisional_python_type'].isin(['bool', 'int', 'float'])]

            # The backslash is doubled so the literal text "\rightarrow" is
            # written; a single "\r" would be a carriage return.
            f.write(f"### 📊 Categorical Fields (Total: {len(categorical_df)})\n")
            for _, row in categorical_df.iterrows():
                f.write(f"- `{row['attribute_name']}` ({row['provisional_python_type']}) $\\rightarrow$ Node: **{row['provisional_entity']}**\n")
            f.write("\n")

            f.write(f"### 🔢 Numerical Fields (Total: {len(numerical_df)})\n")
            for _, row in numerical_df.iterrows():
                f.write(f"- `{row['attribute_name']}` ({row['provisional_python_type']}) $\\rightarrow$ Node: **{row['provisional_entity']}**\n")
            f.write("\n")

            f.write(f"### 🧠 Semantic Attributes Grouped By Parent Class (Total: {len(semantic_df)})\n")
            for ent_group, group_df in semantic_df.groupby('provisional_entity'):
                f.write(f"#### Entity Category: `{ent_group}`\n")
                for _, row in group_df.iterrows():
                    geo_suffix = f" [GEO Linked: {row['related_entity']}]" if row['is_geographical'] else ""
                    f.write(f" - `{row['attribute_name']}` ({row['provisional_python_type']}){geo_suffix}\n")
                f.write("\n")

        logger.info(f"Generated parser metadata documentation report saved to: {report_path}")

    def process(self):
        """Runs extraction, batch analysis and export for every configured file.

        Config keys read: ``files``, ``dd_parser_output_dir``, ``batch_size``,
        ``csv_target_column_index``, ``output_filename`` and ``model_name``.
        A file that cannot be read, or a batch that fails, is logged and
        skipped; the remaining work continues.

        Raises:
            ValueError: if ``batch_size`` is not a positive integer.
        """
        files_to_process = self.config.get('files', [])
        if not files_to_process:
            logger.warning("No input files configured under 'files'; nothing to process.")
            return

        abs_output_dir = self._resolve_path(self.config.get('dd_parser_output_dir', 'dd_analysis_results'))
        # Normally created by set_working_config; harmless if it exists.
        os.makedirs(abs_output_dir, exist_ok=True)

        batch_size = self.config.get('batch_size', 10)
        csv_col_idx = self.config.get('csv_target_column_index', 0)
        config_filename = self.config.get('output_filename')
        if not isinstance(batch_size, int) or batch_size < 1:
            # range() rejects a zero step and yields nothing for a negative one.
            raise ValueError(f"batch_size must be a positive integer, got: {batch_size!r}")

        # Extract base project root for kmds-data-helper structure alignment
        base_project_dir = os.path.abspath(os.path.join(self.working_dir, ".."))

        for filepath in files_to_process:
            input_file_path = self._resolve_path(filepath)
            if not os.path.exists(input_file_path):
                logger.warning(f"Skipping missing input file: {input_file_path}")
                continue

            filename = os.path.basename(input_file_path)
            try:
                raw_attributes = self.extract_attributes(input_file_path, csv_col_idx)
                logger.info(f"Extracted {len(raw_attributes)} attributes from {filename}")
            except Exception as e:
                logger.exception(f"Read failure on {filename}: {e}")
                continue

            final_results = []
            for i in range(0, len(raw_attributes), batch_size):
                batch = raw_attributes[i:i + batch_size]
                start_time = time.perf_counter()
                try:
                    batch_output = self.analyze_batch(batch)
                    batch_dicts = [item.model_dump() for item in batch_output]
                    final_results.extend(self.post_process_cleaner(batch_dicts))
                    logger.info(f" Batch run {i} verified in {time.perf_counter() - start_time:.2f}s")
                except Exception as e:
                    logger.error(f" Batch crash at element index {i}: {e}")

            if not final_results:
                continue

            # The configured name is only used when it cannot be ambiguous.
            if config_filename and len(files_to_process) == 1:
                out_filename = config_filename
            else:
                base_name, _ = os.path.splitext(filename)
                out_filename = f"mapped_{base_name}.csv"

            output_csv_path = os.path.join(abs_output_dir, out_filename)

            # First line is the signature the cleaner checks before loading.
            preamble = f"# DD-PARSER-SIGNATURE: PROCESSED-BY-{self.config.get('model_name', 'llama3.2').upper()}\n"
            with open(output_csv_path, 'w', encoding='utf-8') as f:
                f.write(preamble)

            pd.DataFrame(final_results).to_csv(output_csv_path, mode='a', index=False)
            logger.info(f"Finished tracking {filename}. Saved verified matrix map to: {output_csv_path}")

            # Dynamic Markdown Generation Task Trigger
            # NOTE: the report path is fixed, so with several input files the
            # report of the last processed file replaces the earlier ones.
            self.generate_parsing_markdown_summary(final_results, base_project_dir)
dd_parser/models.py ADDED
@@ -0,0 +1,21 @@
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional, Literal
3
+
4
class AttributeAnalysis(BaseModel):
    # Classification of one data dictionary attribute. The field
    # descriptions below are part of the model's JSON schema. This note is a
    # comment rather than a docstring so that the schema stays unchanged.

    # Attribute name as it appears in the data dictionary.
    attribute_name: str

    # Owning business entity.
    provisional_entity: str = Field(description="The primary business entity this attribute belongs to (e.g., Customer, Product, Transaction).")

    # Whether the attribute describes a location.
    is_geographical: bool = Field(description="True if the attribute represents a physical location, address, coordinate, country, or region.")

    # Entity the location belongs to; optional and defaults to None.
    related_entity: Optional[str] = Field(default=None, description="If geographical, which entity does this location bind to? (e.g., 'Customer' for 'shipping_state').")

    # Restricted to the six type names listed in the Literal.
    provisional_python_type: Literal["str", "int", "float", "datetime.date", "datetime.datetime", "bool"] = Field(description="Semantic Python data type based on semantics.")
19
+
20
class BatchAnalysisResponse(BaseModel):
    # Envelope for one batch reply: a list of per-attribute analyses.
    # (Comment, not docstring, so the generated JSON schema is unchanged.)
    analysis: list[AttributeAnalysis]
dd_parser/py.typed ADDED
File without changes
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: dd-parser-cleaner
3
+ Version: 0.1.0
4
+ Summary: A private, local LLM-powered data dictionary parser and entity mapper with automated cleaning.
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: ollama>=0.2.0
7
+ Requires-Dist: pandas>=2.2.0
8
+ Requires-Dist: pydantic>=2.6.0
9
+ Requires-Dist: pypdf>=4.1.0
10
+ Requires-Dist: pyyaml>=6.0.1
@@ -0,0 +1,12 @@
1
+ dd_cleaner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ dd_cleaner/cli.py,sha256=rDDLQWNm-JLbL2i4teTPqya1Cg4VOtKCsGzGZARZwTU,1253
3
+ dd_cleaner/engine.py,sha256=JAfuW36Os63UOLnWTouexe01jiE8sTCSNrlgy5JqrT0,8523
4
+ dd_parser/__init__.py,sha256=hEBM_BAfswjX_DjyzGSKs1pbP-08MI1lpVd_xFjT31o,194
5
+ dd_parser/cli.py,sha256=Bt2-u23moNhoHkhtoxlSgqEQyLLgTlg5bMlsvQZSF3g,880
6
+ dd_parser/core.py,sha256=WAoPrxSCisZkI7Q56PlbgyHoyXwXugRussHsfyHL7BE,11826
7
+ dd_parser/models.py,sha256=XVFU3VH91bhFw_LT6-4hoI0RPHlaYWCP8phEiIccy-Y,912
8
+ dd_parser/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ dd_parser_cleaner-0.1.0.dist-info/METADATA,sha256=kIZAgeaBt8MqeAK-eRVZMVdHr7kNlLCiFd597HtY19I,335
10
+ dd_parser_cleaner-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
11
+ dd_parser_cleaner-0.1.0.dist-info/entry_points.txt,sha256=GSGie27i-zUte6BbiBF8OBUGCjVhTjkwjD4rnf_KmtU,93
12
+ dd_parser_cleaner-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ classify-entities = dd_parser.cli:main
3
+ clean-dataset = dd_cleaner.cli:main