kmds-data-helper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kmds_data_helper-0.1.0/PKG-INFO +110 -0
- kmds_data_helper-0.1.0/README.md +80 -0
- kmds_data_helper-0.1.0/pyproject.toml +72 -0
- kmds_data_helper-0.1.0/setup.cfg +4 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/__init__.py +12 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/aggregator.py +44 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/cli.py +12 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/config_manager.py +43 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/data_processor.py +200 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/engine.py +123 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/helper_output_adapter.py +352 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/kb_aggregator.py +121 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/kmds_check.py +69 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/llm_client.py +102 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/service.py +152 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper/utils.py +53 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper.egg-info/PKG-INFO +110 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper.egg-info/SOURCES.txt +23 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper.egg-info/dependency_links.txt +1 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper.egg-info/entry_points.txt +5 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper.egg-info/requires.txt +19 -0
- kmds_data_helper-0.1.0/src/kmds_data_helper.egg-info/top_level.txt +1 -0
- kmds_data_helper-0.1.0/tests/test_gatekeeper.py +23 -0
- kmds_data_helper-0.1.0/tests/test_kmds.py +26 -0
- kmds_data_helper-0.1.0/tests/test_personas.py +66 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kmds-data-helper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Repository-grounded KMDS helper that analyzes project artifacts and builds a KMDS knowledge graph
|
|
5
|
+
Author: KMDS Data Helper Team
|
|
6
|
+
License: Proprietary
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: <3.13,>=3.12
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: kmds
|
|
13
|
+
Requires-Dist: pymupdf4llm>=0.0.12
|
|
14
|
+
Requires-Dist: ollama>=0.1.0
|
|
15
|
+
Requires-Dist: pandas>=2.0.0
|
|
16
|
+
Requires-Dist: setuptools<70.0.0
|
|
17
|
+
Requires-Dist: urllib3<2.0.0
|
|
18
|
+
Requires-Dist: rich>=13.0.0
|
|
19
|
+
Requires-Dist: nbformat>=5.10.4
|
|
20
|
+
Requires-Dist: fastapi>=0.136.1
|
|
21
|
+
Requires-Dist: uvicorn>=0.46.0
|
|
22
|
+
Requires-Dist: requests>=2.32.5
|
|
23
|
+
Requires-Dist: pytest>=9.0.3
|
|
24
|
+
Requires-Dist: pypdf>=6.11.0
|
|
25
|
+
Requires-Dist: fg-data-profiling>=4.19.1
|
|
26
|
+
Provides-Extra: test
|
|
27
|
+
Requires-Dist: pytest; extra == "test"
|
|
28
|
+
Requires-Dist: pytest-xdist; extra == "test"
|
|
29
|
+
Requires-Dist: requests; extra == "test"
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
# KMDS Data Helper: Repo Architect Framework
|
|
33
|
+
|
|
34
|
+
A modular, multi-persona framework for analyzing data science repositories. Uses local LLMs (via Ollama) to synthesize insights from documentation, data schemas, and Jupyter notebooks.
|
|
35
|
+
|
|
36
|
+
## 📂 Project Structure
|
|
37
|
+
KMDS-Helper follows a strict modular architecture to separate concerns:
|
|
38
|
+
- `src/kmds_data_helper/`: Core logic modules (Config, Processing, LLM, Engine).
|
|
39
|
+
- `documents/`: Project documentation (.pdf, .txt).
|
|
40
|
+
- `data/`: Physical data assets (CSVs) - isolated from output.
|
|
41
|
+
- `notebooks/`: Experimental code (.ipynb).
|
|
42
|
+
- `output/`: Isolated directory for generated reports.
|
|
43
|
+
|
|
44
|
+
## 🛠️ Installation & Setup
|
|
45
|
+
1. **Environment**: Ensure you are using the local virtual environment.
|
|
46
|
+
```bash
|
|
47
|
+
source .venv/bin/activate
|
|
48
|
+
```
|
|
49
|
+
2. **LLM Engine**: Requires [Ollama](https://ollama.com) running locally with the `qwen2.5-coder:7b` model.
|
|
50
|
+
3. **Dependencies**:
|
|
51
|
+
```bash
|
|
52
|
+
pip install rich ollama fg-data-profiling pymupdf4llm nbformat pyyaml
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## ⚙️ Configuration
|
|
56
|
+
The framework is controlled by `kmds_config.yaml` in the root directory. You can toggle persona behaviors (Scientist, Tech Lead, Architect) and pathing without changing Python code.
|
|
57
|
+
|
|
58
|
+
## 🚀 Usage
|
|
59
|
+
Run the main orchestrator from the project root:
|
|
60
|
+
```bash
|
|
61
|
+
python3 main.py
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## 📦 Packaged Usage (v1)
|
|
65
|
+
This first version assumes a fixed repository structure. A user can install the package, run the knowledge-graph aggregator in a cloned repo, and produce a KMDS knowledge graph.
|
|
66
|
+
|
|
67
|
+
### Required folders in the cloned repo
|
|
68
|
+
- `documents/`
|
|
69
|
+
- `notebooks/`
|
|
70
|
+
- `data_dictionary/`
|
|
71
|
+
- `output/`
|
|
72
|
+
|
|
73
|
+
### Expected helper output artifacts
|
|
74
|
+
At least one of these files should exist in `output/`:
|
|
75
|
+
- `full_service_report.json`
|
|
76
|
+
- `kmds_summary.json`
|
|
77
|
+
- `kmds_strategic_summary.json`
|
|
78
|
+
|
|
79
|
+
### Install
|
|
80
|
+
From the project root:
|
|
81
|
+
```bash
|
|
82
|
+
pip install -e .
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Generate knowledge graph from helper outputs
|
|
86
|
+
```bash
|
|
87
|
+
kmds-kb --workspace . --project-file project_knowledge_graph.xml --mode auto
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
The command validates the required folders, ingests the helper output artifacts, and writes:
|
|
91
|
+
- `project_knowledge_graph.xml`
|
|
92
|
+
|
|
93
|
+
### Adapter command (direct use)
|
|
94
|
+
You can also run the output adapter directly for a single file:
|
|
95
|
+
```bash
|
|
96
|
+
kmds-analyze --input output/full_service_report.json --project-file project_knowledge_graph.xml --create-project --workflow-name kmds_project_workflow --mode auto
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Backward-compatible template script
|
|
100
|
+
If you are using the template script path, this remains supported:
|
|
101
|
+
```bash
|
|
102
|
+
python kb_aggregator.py --workspace . --project-file project_knowledge_graph.xml --mode auto
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Common failure messages
|
|
106
|
+
- Missing folder(s): one or more required directories are absent.
|
|
107
|
+
- No helper output files found: none of the expected JSON artifacts are present in `output/`.
|
|
108
|
+
- Project file already exists in create mode: rerun with update mode or choose a new target path.
|
|
109
|
+
|
|
110
|
+
---
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
---
|
|
2
|
+
# KMDS Data Helper: Repo Architect Framework
|
|
3
|
+
|
|
4
|
+
A modular, multi-persona framework for analyzing data science repositories. Uses local LLMs (via Ollama) to synthesize insights from documentation, data schemas, and Jupyter notebooks.
|
|
5
|
+
|
|
6
|
+
## 📂 Project Structure
|
|
7
|
+
KMDS-Helper follows a strict modular architecture to separate concerns:
|
|
8
|
+
- `src/kmds_data_helper/`: Core logic modules (Config, Processing, LLM, Engine).
|
|
9
|
+
- `documents/`: Project documentation (.pdf, .txt).
|
|
10
|
+
- `data/`: Physical data assets (CSVs) - isolated from output.
|
|
11
|
+
- `notebooks/`: Experimental code (.ipynb).
|
|
12
|
+
- `output/`: Isolated directory for generated reports.
|
|
13
|
+
|
|
14
|
+
## 🛠️ Installation & Setup
|
|
15
|
+
1. **Environment**: Ensure you are using the local virtual environment.
|
|
16
|
+
```bash
|
|
17
|
+
source .venv/bin/activate
|
|
18
|
+
```
|
|
19
|
+
2. **LLM Engine**: Requires [Ollama](https://ollama.com) running locally with the `qwen2.5-coder:7b` model.
|
|
20
|
+
3. **Dependencies**:
|
|
21
|
+
```bash
|
|
22
|
+
pip install rich ollama fg-data-profiling pymupdf4llm nbformat pyyaml
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## ⚙️ Configuration
|
|
26
|
+
The framework is controlled by `kmds_config.yaml` in the root directory. You can toggle persona behaviors (Scientist, Tech Lead, Architect) and pathing without changing Python code.
|
|
27
|
+
|
|
28
|
+
## 🚀 Usage
|
|
29
|
+
Run the main orchestrator from the project root:
|
|
30
|
+
```bash
|
|
31
|
+
python3 main.py
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## 📦 Packaged Usage (v1)
|
|
35
|
+
This first version assumes a fixed repository structure. A user can install the package, run the knowledge-graph aggregator in a cloned repo, and produce a KMDS knowledge graph.
|
|
36
|
+
|
|
37
|
+
### Required folders in the cloned repo
|
|
38
|
+
- `documents/`
|
|
39
|
+
- `notebooks/`
|
|
40
|
+
- `data_dictionary/`
|
|
41
|
+
- `output/`
|
|
42
|
+
|
|
43
|
+
### Expected helper output artifacts
|
|
44
|
+
At least one of these files should exist in `output/`:
|
|
45
|
+
- `full_service_report.json`
|
|
46
|
+
- `kmds_summary.json`
|
|
47
|
+
- `kmds_strategic_summary.json`
|
|
48
|
+
|
|
49
|
+
### Install
|
|
50
|
+
From the project root:
|
|
51
|
+
```bash
|
|
52
|
+
pip install -e .
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Generate knowledge graph from helper outputs
|
|
56
|
+
```bash
|
|
57
|
+
kmds-kb --workspace . --project-file project_knowledge_graph.xml --mode auto
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
The command validates the required folders, ingests the helper output artifacts, and writes:
|
|
61
|
+
- `project_knowledge_graph.xml`
|
|
62
|
+
|
|
63
|
+
### Adapter command (direct use)
|
|
64
|
+
You can also run the output adapter directly for a single file:
|
|
65
|
+
```bash
|
|
66
|
+
kmds-analyze --input output/full_service_report.json --project-file project_knowledge_graph.xml --create-project --workflow-name kmds_project_workflow --mode auto
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Backward-compatible template script
|
|
70
|
+
If you are using the template script path, this remains supported:
|
|
71
|
+
```bash
|
|
72
|
+
python kb_aggregator.py --workspace . --project-file project_knowledge_graph.xml --mode auto
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Common failure messages
|
|
76
|
+
- Missing folder(s): one or more required directories are absent.
|
|
77
|
+
- No helper output files found: none of the expected JSON artifacts are present in `output/`.
|
|
78
|
+
- Project file already exists in create mode: rerun with update mode or choose a new target path.
|
|
79
|
+
|
|
80
|
+
---
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "kmds-data-helper"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Repository-grounded KMDS helper that analyzes project artifacts and builds a KMDS knowledge graph"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "Proprietary" }
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "KMDS Data Helper Team" }
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.12,<3.13"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"kmds",
|
|
13
|
+
"pymupdf4llm>=0.0.12",
|
|
14
|
+
"ollama>=0.1.0",
|
|
15
|
+
"pandas>=2.0.0",
|
|
16
|
+
"setuptools<70.0.0",
|
|
17
|
+
"urllib3<2.0.0",
|
|
18
|
+
"rich>=13.0.0",
|
|
19
|
+
"nbformat>=5.10.4",
|
|
20
|
+
"fastapi>=0.136.1",
|
|
21
|
+
"uvicorn>=0.46.0",
|
|
22
|
+
"requests>=2.32.5",
|
|
23
|
+
"pytest>=9.0.3",
|
|
24
|
+
"pypdf>=6.11.0",
|
|
25
|
+
"fg-data-profiling>=4.19.1",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
classifiers = [
|
|
29
|
+
"Programming Language :: Python :: 3",
|
|
30
|
+
"Programming Language :: Python :: 3.12",
|
|
31
|
+
"Operating System :: OS Independent",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[build-system]
|
|
35
|
+
requires = ["setuptools>=61.0"]
|
|
36
|
+
build-backend = "setuptools.build_meta"
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
test = [
|
|
40
|
+
"pytest",
|
|
41
|
+
"pytest-xdist",
|
|
42
|
+
"requests"
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.scripts]
|
|
46
|
+
# Maps kmds-check to your verified, production pre-flight entry point
|
|
47
|
+
kmds-check = "kmds_data_helper.kmds_check:main"
|
|
48
|
+
# NEW: Maps kmds-analyze to your unified application CLI execution entry point
|
|
49
|
+
kmds-analyze = "kmds_data_helper.helper_output_adapter:main"
|
|
50
|
+
kmds-kb = "kmds_data_helper.kb_aggregator:main"
|
|
51
|
+
kmds-test = "pytest:main"
|
|
52
|
+
|
|
53
|
+
[tool.pytest.ini_options]
|
|
54
|
+
# Ensures pytest only runs the new persona tests and stays away from broken legacy files
|
|
55
|
+
testpaths = ["tests/test_personas.py"]
|
|
56
|
+
python_files = "test_*.py"
|
|
57
|
+
|
|
58
|
+
# -n 3: Runs 3 parallel workers
|
|
59
|
+
# -v: Verbose output
|
|
60
|
+
# --disable-warnings: Hides those annoying urllib3 version mismatch logs
|
|
61
|
+
addopts = "-n 3 -v --disable-warnings"
|
|
62
|
+
|
|
63
|
+
filterwarnings = [
|
|
64
|
+
"ignore:urllib3.*match a supported version:requests.RequestsDependencyWarning",
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
[dependency-groups]
|
|
69
|
+
dev = [
|
|
70
|
+
"pytest-asyncio>=1.3.0",
|
|
71
|
+
"pytest-xdist>=3.8.0",
|
|
72
|
+
]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from .engine import KMDSEngine
|
|
2
|
+
from .utils import parse_notebook_with_outputs, save_kmds_json
|
|
3
|
+
from .aggregator import PersonaAggregator
|
|
4
|
+
from .llm_client import LLMClient
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"KMDSEngine",
|
|
8
|
+
"parse_notebook_with_outputs",
|
|
9
|
+
"save_kmds_json",
|
|
10
|
+
"PersonaAggregator",
|
|
11
|
+
"LLMClient"
|
|
12
|
+
]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Dict, Any, List
|
|
3
|
+
|
|
4
|
+
class PersonaAggregator:
    """
    Knowledge Dictionary Builder.

    Consolidates individual persona reports into a single grounded
    knowledge base for strategic synthesis. Findings are kept in
    ``knowledge_dict`` as ``{persona: {notebook: analysis}}``.
    """

    def __init__(self):
        # Stores the grounded findings for the full project
        self.knowledge_dict: Dict[str, Any] = {}

    def add_audit_result(self, result: Dict[str, Any]) -> None:
        """
        Ingests a result from KMDSEngine and maps it to the knowledge dictionary.

        Input format: {"notebook": "...", "persona": "...", "analysis": {...}}

        Results whose analysis contains an "error" entry are ignored. A missing
        or null "analysis" is recorded as an empty finding.
        """
        persona = result.get("persona")
        notebook = result.get("notebook")

        # A default passed to .get() only covers a *missing* key; an explicit
        # null analysis would reach the membership test below as None and
        # raise TypeError, so normalise it here.
        data = result.get("analysis")
        if data is None:
            data = {}

        # Grounding Safeguard: Only aggregate successful, non-error findings
        if "error" in data:
            return

        # Index findings by notebook for multi-notebook project tracking
        self.knowledge_dict.setdefault(persona, {})[notebook] = data

    def get_grounded_stats(self) -> str:
        """
        Formats the current knowledge dictionary into a string for the
        Strategic Lead's RUN_STATS input.

        Returns a fixed placeholder sentence when nothing has been aggregated;
        otherwise a header line followed by one JSON-encoded line per
        (persona, notebook) pair.
        """
        if not self.knowledge_dict:
            return "NO PRIOR AUDIT EVIDENCE FOUND."

        lines = ["KMDS GROUNDED EVIDENCE DICTIONARY:\n"]
        lines.extend(
            f"[{persona}] Evidence from {nb_name}: {json.dumps(findings)}\n"
            for persona, notebooks in self.knowledge_dict.items()
            for nb_name, findings in notebooks.items()
        )
        return "".join(lines)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Backward-compatible CLI shim.
|
|
2
|
+
|
|
3
|
+
The canonical implementation lives in helper_output_adapter.py.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from .helper_output_adapter import ingest_helper_output, main

# Re-export the adapter's entry points so they stay importable from this module.
__all__ = ["ingest_helper_output", "main"]


if __name__ == "__main__":
    # Hand main()'s return value to SystemExit so it becomes the exit status.
    raise SystemExit(main())
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import yaml
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Dict, Any
|
|
4
|
+
|
|
5
|
+
class ConfigManager:
    """
    The central source of truth for the KMDS Workspace configuration.
    Resolves explicit layout directory maps from kmds_config.yaml.

    Attributes (all set in ``__init__``):
        workspace: root directory every other path is resolved against.
        config_path: ``<workspace>/kmds_config.yaml``.
        config: parsed YAML content, or ``{}`` when the file is absent or empty.
        paths: mapping of logical folder names to ``Path`` objects.
    """

    def __init__(self, workspace_path: str = "."):
        self.workspace = Path(workspace_path)
        self.config_path = self.workspace / "kmds_config.yaml"
        self.config = self._load_config()

        # Centralized pathing map: Separates KMDS 'documents' from Sphinx 'docs'
        self.paths = {
            "notebooks": self.get_directory_path("notebooks"),
            "personas": self.get_directory_path("personas"),
            "documents": self.get_directory_path("documents"),
            "data_dictionary": self.get_directory_path("data_dictionary"),
            "data": self.get_directory_path("data"),
            "sphinx_docs": self.workspace / "docs",  # Isolated Sphinx tree
            "output": self.workspace / "output",
        }

    def _load_config(self) -> Dict[str, Any]:
        """Return the parsed kmds_config.yaml, or ``{}`` if it is missing or empty."""
        if not self.config_path.exists():
            return {}
        with open(self.config_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f) or {}

    def get_directory_path(self, key: str) -> Path:
        """Resolves the configured folder path against the workspace root.

        The folder name comes from the ``directories`` section of the config;
        when that section or the individual entry is absent or null, the key
        itself is used as the folder name.
        """
        # A bare "directories:" line in YAML parses to None rather than a
        # mapping, so fall back to an empty mapping instead of calling
        # .get() on None.
        dirs = self.config.get("directories") or {}
        folder_name = dirs.get(key)
        if folder_name is None:
            # Every built-in folder defaults to a directory named after its key.
            folder_name = key
        return self.workspace / folder_name
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import csv as native_csv
|
|
3
|
+
from data_profiling import ProfileReport
|
|
4
|
+
import nbformat
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
class KMDSDataProcessor:
    """Collects grounding evidence (text documents, CSV schemas, notebooks).

    ``config_manager`` must expose a ``paths`` mapping with ``documents``,
    ``data``, ``notebooks`` and ``output`` entries that behave like
    ``pathlib.Path`` objects (``glob`` and ``resolve`` are called on them).
    """

    # Number of characters kept from each text document.
    _DOC_CHAR_LIMIT = 5000
    # Number of data rows sampled from each CSV when inferring column types.
    _SAMPLE_ROWS = 100

    def __init__(self, config_manager):
        self.cfg = config_manager

        self.active_features = {
            "pdf_processing": False,
            "data_profiling": False,
            "notebook_analysis": False
        }
        self._run_system_checks()

    def _run_system_checks(self):
        """Internal guardrail to verify if directories contain valid files."""
        if any(self.cfg.paths["documents"].glob("*.pdf")):
            self.active_features["pdf_processing"] = True

        if any(self.cfg.paths["data"].glob("**/*.csv")):
            self.active_features["data_profiling"] = True

        if any(self.cfg.paths["notebooks"].glob("*.ipynb")):
            self.active_features["notebook_analysis"] = True

    def get_ground_truth(self):
        """
        Scans the KMDS documents and data directories.

        Returns a list of dicts: one ``{"source", "type": "doc", "content"}``
        entry per ``*.txt`` document, followed by one
        ``{"source", "type": "physical_schema", "columns", "data_types"}``
        entry per CSV found under the data directory. CSVs located inside the
        output directory are skipped. A file that cannot be read or parsed is
        reported on stdout and left out; it never aborts the scan.
        """
        truth = []

        # 1. Ingest text instructions from your KMDS 'documents' folder
        for txt in self.cfg.paths["documents"].glob("*.txt"):
            try:
                with open(txt, 'r', encoding='utf-8') as f:
                    # Read only the prefix that is kept instead of loading
                    # the whole file and slicing it afterwards.
                    content = f.read(self._DOC_CHAR_LIMIT)
            except Exception as e:
                # Best-effort scan: one unreadable document must not stop it.
                print(f"[-] Failed reading text asset {txt.name}: {e}")
                continue
            truth.append({"source": txt.name, "type": "doc", "content": content})

        # 2. Tabular Data Schema Profiling
        output_root = self.cfg.paths["output"].resolve()
        for csv_path in self.cfg.paths["data"].glob("**/*.csv"):
            # Compare path components, not raw strings: a substring test
            # would also exclude siblings such as "<output>_backup/".
            if csv_path.resolve().is_relative_to(output_root):
                continue

            try:
                print(f"🔬 [PROFILER] Parsing file structure for {csv_path.name}...")
                schema = self._profile_csv(csv_path)
            except Exception as e:
                # Best-effort scan: report the file and move on to the next.
                print(f"[-] Profile extraction failed for {csv_path.name}: {e}")
                continue

            truth.append(schema)
            print(f"✅ [PROFILER] Extracted all {len(schema['columns'])} active schema parameters.")

        return truth

    def _profile_csv(self, csv_path):
        """Build the ``physical_schema`` entry for one CSV file."""
        # Step A: Native CSV stream lookahead to extract absolute raw headers
        with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
            sample = f.readline()
            delimiter = ';' if ';' in sample else ','
            f.seek(0)
            header = next(native_csv.reader(f, delimiter=delimiter), None)

        if header is None:
            raise ValueError("file is empty, no header row found")

        # Trim quotation tokens and whitespaces cleanly
        columns_found = [c.strip().strip('"').strip("'") for c in header if c.strip()]

        # Step B: Read a data sample. The sniffed delimiter has to be passed
        # on, otherwise ';'-separated files are split on ',' and every column
        # after the first is filled with NaN.
        df = pd.read_csv(
            csv_path,
            sep=delimiter,
            header=0,
            names=columns_found,
            nrows=self._SAMPLE_ROWS,
            low_memory=False,
            on_bad_lines='skip'
        )

        # Step C: Generate Type Insights from the sampled column dtypes
        type_insights = {col: self._classify_column(df[col]) for col in df.columns}

        return {
            "source": csv_path.name,
            "type": "physical_schema",
            "columns": list(df.columns),
            "data_types": type_insights
        }

    @staticmethod
    def _classify_column(series):
        """Map a pandas column to Boolean / Numeric / DateTime / Categorical."""
        # Bool must be tested first: is_numeric_dtype() is also true for bool
        # columns, which would make the Boolean label unreachable.
        if pd.api.types.is_bool_dtype(series):
            return "Boolean"
        if pd.api.types.is_numeric_dtype(series):
            return "Numeric"
        if pd.api.types.is_datetime64_any_dtype(series):
            return "DateTime"
        return "Categorical"

    def read_notebook(self, nb_path):
        """Parses .ipynb files into markdown and code chunks."""
        with open(nb_path, 'r', encoding='utf-8') as f:
            nb = nbformat.read(f, as_version=4)
        return {
            "markdown": [c.source for c in nb.cells if c.cell_type == 'markdown'],
            "code": [c.source for c in nb.cells if c.cell_type == 'code']
        }
|