jdatamunch-mcp 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jdatamunch_mcp-0.1.0/PKG-INFO +24 -0
- jdatamunch_mcp-0.1.0/README.md +1 -0
- jdatamunch_mcp-0.1.0/pyproject.toml +45 -0
- jdatamunch_mcp-0.1.0/server.json +21 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/__init__.py +3 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/config.py +25 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/parser/__init__.py +34 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/parser/csv_parser.py +107 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/parser/types.py +19 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/__init__.py +1 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/column_profiler.py +299 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/histogram.py +36 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/value_indexer.py +32 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/security.py +107 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/server.py +441 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/storage/__init__.py +1 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/storage/data_store.py +232 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/storage/sqlite_store.py +403 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/storage/token_tracker.py +104 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/__init__.py +1 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/aggregate.py +79 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/describe_column.py +121 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/describe_dataset.py +89 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/get_rows.py +96 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/get_session_stats.py +36 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/index_local.py +191 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/list_datasets.py +25 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/sample_rows.py +75 -0
- jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/search_data.py +178 -0
- jdatamunch_mcp-0.1.0/tests/conftest.py +74 -0
- jdatamunch_mcp-0.1.0/tests/test_aggregate.py +80 -0
- jdatamunch_mcp-0.1.0/tests/test_describe_dataset.py +96 -0
- jdatamunch_mcp-0.1.0/tests/test_get_rows.py +142 -0
- jdatamunch_mcp-0.1.0/tests/test_index_local.py +82 -0
- jdatamunch_mcp-0.1.0/tests/test_search_data.py +66 -0

jdatamunch_mcp-0.1.0/PKG-INFO
@@ -0,0 +1,24 @@
+Metadata-Version: 2.4
+Name: jdatamunch-mcp
+Version: 0.1.0
+Summary: Token-efficient MCP server for tabular data retrieval via CSV/Excel indexing
+Project-URL: Homepage, https://github.com/jgravelle/jdatamunch-mcp
+Project-URL: Repository, https://github.com/jgravelle/jdatamunch-mcp
+Project-URL: Bug Tracker, https://github.com/jgravelle/jdatamunch-mcp/issues
+Author-email: "J. Gravelle" <j@gravelle.us>
+Requires-Python: >=3.10
+Requires-Dist: charset-normalizer>=3.0.0
+Requires-Dist: mcp<2.0.0,>=1.10.0
+Provides-Extra: all
+Requires-Dist: anthropic>=0.40.0; extra == 'all'
+Requires-Dist: google-generativeai>=0.8.0; extra == 'all'
+Requires-Dist: openpyxl>=3.1.0; extra == 'all'
+Provides-Extra: anthropic
+Requires-Dist: anthropic>=0.40.0; extra == 'anthropic'
+Provides-Extra: excel
+Requires-Dist: openpyxl>=3.1.0; extra == 'excel'
+Provides-Extra: gemini
+Requires-Dist: google-generativeai>=0.8.0; extra == 'gemini'
+Description-Content-Type: text/markdown
+
+# jdatamunch-mcp

jdatamunch_mcp-0.1.0/README.md
@@ -0,0 +1 @@
+# jdatamunch-mcp

jdatamunch_mcp-0.1.0/pyproject.toml
@@ -0,0 +1,45 @@
+[project]
+name = "jdatamunch-mcp"
+version = "0.1.0"
+description = "Token-efficient MCP server for tabular data retrieval via CSV/Excel indexing"
+readme = "README.md"
+requires-python = ">=3.10"
+authors = [
+    { name = "J. Gravelle", email = "j@gravelle.us" },
+]
+dependencies = [
+    "mcp>=1.10.0,<2.0.0",
+    "charset-normalizer>=3.0.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/jgravelle/jdatamunch-mcp"
+Repository = "https://github.com/jgravelle/jdatamunch-mcp"
+"Bug Tracker" = "https://github.com/jgravelle/jdatamunch-mcp/issues"
+
+[project.optional-dependencies]
+excel = ["openpyxl>=3.1.0"]
+anthropic = ["anthropic>=0.40.0"]
+gemini = ["google-generativeai>=0.8.0"]
+all = ["openpyxl>=3.1.0", "anthropic>=0.40.0", "google-generativeai>=0.8.0"]
+
+[project.scripts]
+jdatamunch-mcp = "jdatamunch_mcp.server:main"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/jdatamunch_mcp"]
+
+[dependency-groups]
+dev = [
+    "pytest>=9.0.2",
+    "pytest-asyncio>=1.3.0",
+    "pytest-cov>=7.0.0",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+asyncio_mode = "auto"
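
The `[project.scripts]` table maps the installed `jdatamunch-mcp` command to `jdatamunch_mcp.server:main`, so a manual launch is just importing and calling that function. A minimal sketch, assuming only what the entry point above declares (a no-argument `main()` in `jdatamunch_mcp.server`):

```python
# Equivalent of the `jdatamunch-mcp` console script (sketch, not part of the package).
from jdatamunch_mcp.server import main

if __name__ == "__main__":
    main()  # starts the MCP server; server.json declares stdio transport
```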

jdatamunch_mcp-0.1.0/server.json
@@ -0,0 +1,21 @@
+{
+  "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
+  "name": "io.github.jgravelle/jdatamunch-mcp",
+  "title": "jDataMunch MCP",
+  "description": "Token-efficient MCP server for tabular data retrieval. Index CSV/Excel files, query rows, aggregate — 99%+ token savings vs raw file reads.",
+  "version": "0.1.0",
+  "packages": [
+    {
+      "registryType": "pypi",
+      "identifier": "jdatamunch-mcp",
+      "version": "0.1.0",
+      "transport": {
+        "type": "stdio"
+      }
+    }
+  ],
+  "repository": {
+    "url": "https://github.com/jgravelle/jdatamunch-mcp",
+    "source": "github"
+  }
+}

jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/config.py
@@ -0,0 +1,25 @@
+"""Environment variable handling and defaults for jdatamunch-mcp."""
+
+import os
+from pathlib import Path
+from typing import Optional
+
+
+def get_index_path(override: Optional[str] = None) -> Path:
+    """Return the base index storage path."""
+    if override:
+        return Path(override)
+    return Path(os.environ.get("DATA_INDEX_PATH", str(Path.home() / ".data-index")))
+
+
+def get_max_rows() -> int:
+    return int(os.environ.get("JDATAMUNCH_MAX_ROWS", "5000000"))
+
+
+def get_share_savings() -> bool:
+    return os.environ.get("JDATAMUNCH_SHARE_SAVINGS", "1") != "0"
+
+
+def get_use_ai_summaries() -> bool:
+    v = os.environ.get("JDATAMUNCH_USE_AI_SUMMARIES", "true").lower()
+    return v not in ("false", "0", "no", "off")
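
Every setting in config.py comes from an environment variable with a default. A small sketch of overriding them before importing the getters (the values shown are illustrative, not package defaults):

```python
import os

# Illustrative overrides; each variable is optional and falls back to the default in config.py.
os.environ["DATA_INDEX_PATH"] = "/tmp/demo-index"       # default: ~/.data-index
os.environ["JDATAMUNCH_MAX_ROWS"] = "100000"             # default: 5,000,000
os.environ["JDATAMUNCH_USE_AI_SUMMARIES"] = "false"      # "false"/"0"/"no"/"off" disable summaries

from jdatamunch_mcp.config import get_index_path, get_max_rows, get_use_ai_summaries

print(get_index_path())        # PosixPath('/tmp/demo-index')
print(get_max_rows())          # 100000
print(get_use_ai_summaries())  # False
```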

jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/parser/__init__.py
@@ -0,0 +1,34 @@
+"""Parser registry for tabular file formats."""
+
+from pathlib import Path
+from typing import Optional
+
+from .types import ParsedDataset
+from .csv_parser import parse_csv
+
+
+def parse_file(
+    path: str,
+    encoding: Optional[str] = None,
+    delimiter: Optional[str] = None,
+    header_row: int = 0,
+    sheet: Optional[str] = None,
+) -> ParsedDataset:
+    """Parse a tabular file and return a streaming ParsedDataset."""
+    p = Path(path)
+    suffix = p.suffix.lower()
+
+    if suffix in (".csv", ".tsv", ".txt"):
+        return parse_csv(path, encoding=encoding, delimiter=delimiter, header_row=header_row)
+    elif suffix in (".xlsx", ".xls"):
+        try:
+            from .excel_parser import parse_excel
+            return parse_excel(path, sheet=sheet, header_row=header_row)
+        except ImportError:
+            raise ValueError(
+                "Excel support requires openpyxl: pip install 'jdatamunch-mcp[excel]'"
+            )
+    else:
+        raise ValueError(
+            f"Unsupported file format: {suffix!r}. Supported: .csv, .tsv, .xlsx, .xls"
+        )
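
`parse_file` dispatches on the file extension and returns a `ParsedDataset` whose rows stream lazily. A minimal usage sketch (the input path is hypothetical):

```python
from jdatamunch_mcp.parser import parse_file

# Any .csv/.tsv/.txt goes through parse_csv; .xlsx/.xls needs the optional openpyxl extra.
ds = parse_file("sales.csv")  # hypothetical file

print([c.name for c in ds.columns])    # header names taken from header_row (default row 0)
print(ds.metadata["estimated_rows"])   # size-based estimate, not an exact count
for row in ds.row_iterator:            # yields one list[str] per data row, streaming
    ...
```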

jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/parser/csv_parser.py
@@ -0,0 +1,107 @@
+"""Streaming CSV parser with auto-encoding detection."""
+
+import csv
+import os
+from pathlib import Path
+from typing import Generator, Optional
+
+from .types import ColumnInfo, ParsedDataset
+
+
+def _detect_encoding(path: str) -> str:
+    """Detect file encoding using charset-normalizer."""
+    try:
+        from charset_normalizer import from_path
+        result = from_path(path, cp_isolation=["utf-8", "latin-1", "cp1252"])
+        if result.best():
+            return str(result.best().encoding)
+    except Exception:
+        pass
+    return "utf-8"
+
+
+def _detect_delimiter(sample: str) -> str:
+    """Detect CSV delimiter using csv.Sniffer."""
+    try:
+        dialect = csv.Sniffer().sniff(sample, delimiters=",\t|;")
+        return dialect.delimiter
+    except Exception:
+        return ","
+
+
+def _row_generator(
+    path: str,
+    encoding: str,
+    delimiter: str,
+    header_row: int,
+) -> Generator:
+    """Yield data rows as lists of strings, skipping the header row."""
+    with open(path, newline="", encoding=encoding, errors="replace") as f:
+        reader = csv.reader(f, delimiter=delimiter)
+        for i, row in enumerate(reader):
+            if i == header_row:
+                continue  # skip header
+            yield row
+
+
+def parse_csv(
+    path: str,
+    encoding: Optional[str] = None,
+    delimiter: Optional[str] = None,
+    header_row: int = 0,
+) -> ParsedDataset:
+    """Parse a CSV file and return a streaming ParsedDataset.
+
+    The row_iterator is a fresh generator each time this is called.
+    Column names come from the header_row.
+    """
+    path = str(Path(path).resolve())
+    file_size = os.path.getsize(path)
+
+    # Detect encoding from file bytes
+    if not encoding:
+        encoding = _detect_encoding(path)
+
+    # Read a sample for delimiter sniffing
+    with open(path, newline="", encoding=encoding, errors="replace") as f:
+        sample = f.read(65536)
+
+    # Detect delimiter
+    if not delimiter:
+        if path.lower().endswith(".tsv"):
+            delimiter = "\t"
+        else:
+            delimiter = _detect_delimiter(sample)
+
+    # Parse header row to get column names
+    header: list = []
+    with open(path, newline="", encoding=encoding, errors="replace") as f:
+        reader = csv.reader(f, delimiter=delimiter)
+        for i, row in enumerate(reader):
+            if i == header_row:
+                header = row
+                break
+
+    columns = [ColumnInfo(name=name.strip(), position=i) for i, name in enumerate(header)]
+
+    # Estimate total row count from file size + sample density
+    sample_lines = sample.count("\n")
+    if sample_lines > 1:
+        bytes_per_row = len(sample.encode(encoding, errors="replace")) / sample_lines
+        estimated_rows = max(0, int(file_size / bytes_per_row) - 1)
+    else:
+        estimated_rows = 0
+
+    metadata = {
+        "encoding": encoding,
+        "delimiter": delimiter,
+        "header_row": header_row,
+        "estimated_rows": estimated_rows,
+        "file_size": file_size,
+    }
+
+    return ParsedDataset(
+        columns=columns,
+        row_iterator=_row_generator(path, encoding, delimiter, header_row),
+        metadata=metadata,
+    )
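
The row-count estimate divides the on-disk file size by the average bytes per line observed in the first 64 KiB of the file (re-encoded to bytes), then subtracts one for the header. A worked sketch of that arithmetic with made-up numbers:

```python
# Worked example of the estimate in parse_csv; all numbers are illustrative.
file_size = 10_000_000   # bytes on disk
sample_bytes = 65_536    # byte length of the re-encoded 64 KiB sample
sample_lines = 1_024     # newlines counted in the sample

bytes_per_row = sample_bytes / sample_lines                   # 64.0 bytes per line
estimated_rows = max(0, int(file_size / bytes_per_row) - 1)   # 156_249 data rows
print(estimated_rows)
```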

jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/parser/types.py
@@ -0,0 +1,19 @@
+"""Core dataclasses for parsed tabular datasets."""
+
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass
+class ColumnInfo:
+    """Metadata about a single column parsed from the file header."""
+    name: str
+    position: int
+
+
+@dataclass
+class ParsedDataset:
+    """Result of parsing a tabular file. Row iterator is lazy/streaming."""
+    columns: list  # list[ColumnInfo]
+    row_iterator: Any  # Generator[list[str], None, None] — yields lists of raw strings
+    metadata: dict  # encoding, delimiter, header_row, estimated_rows, file_size
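
Because `ParsedDataset` needs only a column list, a row iterator, and a metadata dict, a test can build one in memory without touching a file. A sketch under that assumption:

```python
from jdatamunch_mcp.parser.types import ColumnInfo, ParsedDataset

# In-memory dataset for a quick test; the shapes mirror what parse_csv produces.
columns = [ColumnInfo(name="id", position=0), ColumnInfo(name="city", position=1)]
rows = iter([["1", "Austin"], ["2", "Boston"]])
ds = ParsedDataset(columns=columns, row_iterator=rows, metadata={"encoding": "utf-8"})

assert [c.name for c in ds.columns] == ["id", "city"]
assert next(ds.row_iterator) == ["1", "Austin"]
```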

jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/__init__.py
@@ -0,0 +1 @@
+"""Column profiling for tabular datasets."""

jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/column_profiler.py
@@ -0,0 +1,299 @@
+"""Single-pass streaming column profiler.
+
+Processes rows one at a time using per-column accumulators.
+Designed to work with index_local.py's main loop where profiling
+and SQLite loading happen in the same pass over the data.
+"""
+
+import re
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+_NULL_VALUES = frozenset([
+    "", "null", "NULL", "none", "None", "N/A", "n/a", "NA", "na",
+    "NaN", "nan", "-", ".", "#N/A", "#NA", "#NULL!", "n.a.", "N.A.",
+])
+
+# Type rank: lower = more specific
+_TYPE_RANK = {"integer": 0, "float": 1, "datetime": 2, "string": 3}
+_TYPE_FROM_RANK = {0: "integer", 1: "float", 2: "datetime", 3: "string"}
+
+MAX_CARDINALITY_TRACK = 5_000  # stop adding new keys to value_counts after this
+SAMPLE_SIZE = 10  # distinct non-null samples to collect
+RESERVOIR_SIZE = 10_000  # numeric values for approximate median
+
+VALUE_INDEX_CARDINALITY_LIMIT = 1_000  # full value map stored if cardinality <= this
+TOP_VALUES_LIMIT = 50  # top values stored for high-cardinality columns
+
+# Common datetime patterns (regex → strptime format string)
+_DATETIME_PATTERNS = [
+    (re.compile(r"^\d{4}-\d{2}-\d{2}$"), "%Y-%m-%d"),
+    (re.compile(r"^\d{2}/\d{2}/\d{4}$"), "%m/%d/%Y"),
+    (re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}"), "%Y-%m-%dT%H:%M:%S"),
+    (re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"), "%Y-%m-%d %H:%M:%S"),
+    (re.compile(r"^\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}"), "%m/%d/%Y %H:%M:%S"),
+    # US date + 12h time with AM/PM (e.g. "01/15/2020 12:00:00 AM")
+    (re.compile(r"^\d{1,2}/\d{1,2}/\d{4} \d{1,2}:\d{2}:\d{2} [AP]M$"), "%m/%d/%Y %I:%M:%S %p"),
+]
+
+
+def _is_datetime_str(value: str) -> bool:
+    for rx, _ in _DATETIME_PATTERNS:
+        if rx.match(value):
+            return True
+    return False
+
+
+def _get_datetime_format(value: str) -> Optional[str]:
+    for rx, fmt in _DATETIME_PATTERNS:
+        if rx.match(value):
+            return fmt
+    return None
+
+
+@dataclass
+class _ColAcc:
+    """Per-column accumulator updated once per row."""
+    name: str
+    position: int
+    # Type tracking (rank only advances upward)
+    type_rank: int = 0
+    # Row counts
+    count: int = 0  # non-null rows
+    null_count: int = 0
+    # Numeric stats (valid when type_rank <= 1)
+    num_min: float = field(default_factory=lambda: float("inf"))
+    num_max: float = field(default_factory=lambda: float("-inf"))
+    num_sum: float = 0.0
+    # Reservoir for approximate median (first RESERVOIR_SIZE numeric values)
+    reservoir: list = field(default_factory=list)
+    # Cardinality / value frequency
+    value_counts: dict = field(default_factory=dict)
+    cardinality_overflow: bool = False
+    # Samples: first SAMPLE_SIZE distinct non-null values seen
+    samples: list = field(default_factory=list)
+    _samples_set: set = field(default_factory=set)
+    # Datetime range (valid when type_rank == 2)
+    dt_min: Optional[str] = None
+    dt_max: Optional[str] = None
+    dt_format: Optional[str] = None
+
+
+def update_acc(acc: _ColAcc, raw_value: str) -> None:
+    """Update accumulator with one raw string value from the CSV."""
+    stripped = raw_value.strip() if raw_value else ""
+
+    if stripped in _NULL_VALUES:
+        acc.null_count += 1
+        return
+
+    acc.count += 1
+
+    # --- Type detection & promotion ---
+    if acc.type_rank == 0:  # currently integer
+        try:
+            int(stripped)
+        except ValueError:
+            acc.type_rank = 1  # promote to float
+
+    if acc.type_rank == 1:  # currently float
+        try:
+            float(stripped)
+        except ValueError:
+            # Check datetime before falling to string
+            if _is_datetime_str(stripped):
+                acc.type_rank = 2
+            else:
+                acc.type_rank = 3  # string
+
+    if acc.type_rank == 2:  # currently datetime
+        if not _is_datetime_str(stripped):
+            acc.type_rank = 3  # string
+        elif acc.dt_format is None:
+            acc.dt_format = _get_datetime_format(stripped)
+
+    # --- Numeric stats ---
+    if acc.type_rank <= 1:
+        try:
+            num = float(stripped)
+            if num < acc.num_min:
+                acc.num_min = num
+            if num > acc.num_max:
+                acc.num_max = num
+            acc.num_sum += num
+            if len(acc.reservoir) < RESERVOIR_SIZE:
+                acc.reservoir.append(num)
+        except ValueError:
+            pass
+
+    # --- Datetime min/max ---
+    if acc.type_rank == 2:
+        if acc.dt_min is None or stripped < acc.dt_min:
+            acc.dt_min = stripped
+        if acc.dt_max is None or stripped > acc.dt_max:
+            acc.dt_max = stripped
+
+    # --- Cardinality / value counts ---
+    if stripped in acc.value_counts:
+        acc.value_counts[stripped] += 1
+    elif not acc.cardinality_overflow:
+        if len(acc.value_counts) < MAX_CARDINALITY_TRACK:
+            acc.value_counts[stripped] = 1
+        else:
+            acc.cardinality_overflow = True
+            acc.value_counts[stripped] = 1
+
+    # --- Samples ---
+    if len(acc.samples) < SAMPLE_SIZE and stripped not in acc._samples_set:
+        acc.samples.append(stripped)
+        acc._samples_set.add(stripped)
+
+
+def _compute_median(reservoir: list) -> Optional[float]:
+    if not reservoir:
+        return None
+    sorted_vals = sorted(reservoir)
+    n = len(sorted_vals)
+    mid = n // 2
+    if n % 2 == 0:
+        return (sorted_vals[mid - 1] + sorted_vals[mid]) / 2.0
+    return float(sorted_vals[mid])
+
+
+@dataclass
+class ColumnProfile:
+    """Fully computed profile for a single column."""
+    name: str
+    position: int
+    type: str  # "integer", "float", "datetime", "string"
+    count: int  # non-null row count
+    null_count: int
+    null_pct: float
+    cardinality: int
+    cardinality_is_exact: bool
+    is_unique: bool
+    is_primary_key_candidate: bool
+    min: Optional[Any]
+    max: Optional[Any]
+    mean: Optional[float]
+    median: Optional[float]
+    sample_values: list
+    value_index: Optional[dict]  # full {value: count} for cardinality <= 1000
+    top_values: Optional[list]  # [{"value": ..., "count": ...}] for high-cardinality
+    datetime_min: Optional[str] = None
+    datetime_max: Optional[str] = None
+    datetime_format: Optional[str] = None
+    ai_summary: Optional[str] = None
+
+
+def finalize_profile(acc: _ColAcc) -> ColumnProfile:
+    """Build a ColumnProfile from a completed _ColAcc."""
+    total = acc.count + acc.null_count
+    null_pct = round(acc.null_count / total * 100, 1) if total > 0 else 0.0
+    col_type = _TYPE_FROM_RANK[acc.type_rank]
+
+    cardinality = len(acc.value_counts)
+    cardinality_is_exact = not acc.cardinality_overflow
+
+    is_unique = (cardinality_is_exact and cardinality == acc.count and acc.count > 0)
+    is_pk_candidate = (
+        is_unique
+        and acc.null_count == 0
+        and col_type in ("integer", "string")
+    )
+
+    # Numeric stats
+    if col_type in ("integer", "float") and acc.count > 0:
+        raw_min = acc.num_min
+        raw_max = acc.num_max
+        mean_val = round(acc.num_sum / acc.count, 4)
+        median_val = _compute_median(acc.reservoir)
+        if col_type == "integer":
+            min_val = int(raw_min) if raw_min != float("inf") else None
+            max_val = int(raw_max) if raw_max != float("-inf") else None
+            median_val = round(median_val, 1) if median_val is not None else None
+        else:
+            min_val = raw_min if raw_min != float("inf") else None
+            max_val = raw_max if raw_max != float("-inf") else None
+    else:
+        min_val = max_val = mean_val = median_val = None
+
+    # Convert sample values to their native type
+    samples: list = []
+    for s in acc.samples:
+        if col_type == "integer":
+            try:
+                samples.append(int(s))
+                continue
+            except ValueError:
+                pass
+        elif col_type == "float":
+            try:
+                samples.append(float(s))
+                continue
+            except ValueError:
+                pass
+        samples.append(s)
+
+    # Value index / top values
+    if cardinality <= VALUE_INDEX_CARDINALITY_LIMIT:
+        value_index: Optional[dict] = {}
+        for val_str, cnt in acc.value_counts.items():
+            if col_type == "integer":
+                try:
+                    key: Any = int(val_str)
+                except ValueError:
+                    key = val_str
+            elif col_type == "float":
+                try:
+                    key = float(val_str)
+                except ValueError:
+                    key = val_str
+            else:
+                key = val_str
+            value_index[str(key)] = cnt
+        top_values = None
+    else:
+        value_index = None
+        sorted_vals = sorted(acc.value_counts.items(), key=lambda x: x[1], reverse=True)
+        top_values = [{"value": v, "count": c} for v, c in sorted_vals[:TOP_VALUES_LIMIT]]
+
+    return ColumnProfile(
+        name=acc.name,
+        position=acc.position,
+        type=col_type,
+        count=acc.count,
+        null_count=acc.null_count,
+        null_pct=null_pct,
+        cardinality=cardinality,
+        cardinality_is_exact=cardinality_is_exact,
+        is_unique=is_unique,
+        is_primary_key_candidate=is_pk_candidate,
+        min=min_val,
+        max=max_val,
+        mean=mean_val,
+        median=median_val,
+        sample_values=samples,
+        value_index=value_index,
+        top_values=top_values,
+        datetime_min=acc.dt_min if col_type == "datetime" else None,
+        datetime_max=acc.dt_max if col_type == "datetime" else None,
+        datetime_format=acc.dt_format if col_type == "datetime" else None,
+    )
+
+
+def infer_types_from_sample(
+    accs: list,  # list[_ColAcc]
+    sample_rows: list,
+) -> list:
+    """Run a subset of rows through the accumulators to detect preliminary types.
+
+    Used to determine column types before creating the SQLite schema.
+    Returns the modified accs list (same objects, updated in-place).
+    """
+    n_cols = len(accs)
+    for row in sample_rows:
+        for i, acc in enumerate(accs):
+            raw = row[i] if i < len(row) else ""
+            update_acc(acc, raw)
+    return accs
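
The profiler is driven from outside: the indexing loop builds one accumulator per column, feeds every row through `update_acc`, and calls `finalize_profile` at the end. A condensed sketch of that loop (index_local.py's real loop also writes the same rows into SQLite; that part is omitted, and `_ColAcc` is an internal class imported here only for illustration):

```python
from jdatamunch_mcp.parser import parse_file
from jdatamunch_mcp.profiler.column_profiler import _ColAcc, update_acc, finalize_profile

ds = parse_file("sales.csv")  # hypothetical input file
accs = [_ColAcc(name=c.name, position=c.position) for c in ds.columns]

for row in ds.row_iterator:  # single streaming pass over the data
    for i, acc in enumerate(accs):
        update_acc(acc, row[i] if i < len(row) else "")

profiles = [finalize_profile(a) for a in accs]
for p in profiles:
    print(p.name, p.type, p.cardinality, p.null_pct)
```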

jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/histogram.py
@@ -0,0 +1,36 @@
+"""Numeric histogram computation from a column profile's reservoir."""
+
+from typing import Optional
+
+
+def compute_histogram(
+    reservoir: list,
+    bins: int = 10,
+    col_min: Optional[float] = None,
+    col_max: Optional[float] = None,
+) -> Optional[dict]:
+    """Compute a histogram from a reservoir of numeric values.
+
+    Returns a dict with 'bins' (count per bin) and 'edges' (bin boundaries),
+    or None if the reservoir is empty.
+    """
+    if not reservoir:
+        return None
+
+    lo = col_min if col_min is not None else min(reservoir)
+    hi = col_max if col_max is not None else max(reservoir)
+
+    if lo == hi:
+        return {"bins": [len(reservoir)], "edges": [lo, hi], "bin_count": 1}
+
+    bin_width = (hi - lo) / bins
+    counts = [0] * bins
+    edges = [round(lo + i * bin_width, 6) for i in range(bins + 1)]
+
+    for val in reservoir:
+        idx = int((val - lo) / bin_width)
+        if idx >= bins:
+            idx = bins - 1
+        counts[idx] += 1
+
+    return {"bins": counts, "edges": edges, "bin_count": bins}
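
`compute_histogram` operates directly on the numeric reservoir the accumulator already collected, so it runs after profiling without re-reading the data. A sketch with a small literal reservoir:

```python
from jdatamunch_mcp.profiler.histogram import compute_histogram

reservoir = [1.0, 2.0, 2.5, 3.0, 9.5]   # e.g. acc.reservoir after profiling
hist = compute_histogram(reservoir, bins=4)

print(hist["edges"])  # [1.0, 3.125, 5.25, 7.375, 9.5]
print(hist["bins"])   # [4, 0, 0, 1]: 1.0 through 3.0 land in the first bin, 9.5 in the last
```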

jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/value_indexer.py
@@ -0,0 +1,32 @@
+"""Value index builder for low-cardinality columns.
+
+Low-cardinality columns (cardinality <= VALUE_INDEX_CARDINALITY_LIMIT) get a
+full inverted index: {value_str: count}. This index is stored in index.json
+and used by search_data for fast value matching without touching SQLite.
+"""
+
+# Re-export the constant so callers can import from this module
+from .column_profiler import VALUE_INDEX_CARDINALITY_LIMIT
+
+
+def build_value_search_index(profiles: list) -> dict:
+    """Build a search-optimised view of all value indexes.
+
+    Returns:
+        {column_name: {"low_cardinality": bool, "values": [str, ...]}}
+
+    Used by search_data to quickly find columns whose values match a query.
+    """
+    result = {}
+    for p in profiles:
+        if p.value_index is not None:
+            result[p.name] = {
+                "low_cardinality": True,
+                "values": list(p.value_index.keys()),
+            }
+        elif p.top_values is not None:
+            result[p.name] = {
+                "low_cardinality": False,
+                "values": [tv["value"] for tv in p.top_values],
+            }
+    return result
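
`build_value_search_index` just reshapes the per-column `value_index` / `top_values` that `finalize_profile` produced. A sketch continuing from the `profiles` list in the profiler example above:

```python
from jdatamunch_mcp.profiler.value_indexer import build_value_search_index

# `profiles` as built by finalize_profile over each column (see the profiler sketch earlier).
search_index = build_value_search_index(profiles)

# Low-cardinality columns expose every distinct value; high-cardinality ones only their top values.
for column, entry in search_index.items():
    print(column, entry["low_cardinality"], entry["values"][:5])
```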