jdatamunch_mcp-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. jdatamunch_mcp-0.1.0/PKG-INFO +24 -0
  2. jdatamunch_mcp-0.1.0/README.md +1 -0
  3. jdatamunch_mcp-0.1.0/pyproject.toml +45 -0
  4. jdatamunch_mcp-0.1.0/server.json +21 -0
  5. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/__init__.py +3 -0
  6. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/config.py +25 -0
  7. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/parser/__init__.py +34 -0
  8. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/parser/csv_parser.py +107 -0
  9. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/parser/types.py +19 -0
  10. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/__init__.py +1 -0
  11. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/column_profiler.py +299 -0
  12. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/histogram.py +36 -0
  13. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/profiler/value_indexer.py +32 -0
  14. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/security.py +107 -0
  15. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/server.py +441 -0
  16. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/storage/__init__.py +1 -0
  17. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/storage/data_store.py +232 -0
  18. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/storage/sqlite_store.py +403 -0
  19. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/storage/token_tracker.py +104 -0
  20. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/__init__.py +1 -0
  21. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/aggregate.py +79 -0
  22. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/describe_column.py +121 -0
  23. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/describe_dataset.py +89 -0
  24. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/get_rows.py +96 -0
  25. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/get_session_stats.py +36 -0
  26. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/index_local.py +191 -0
  27. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/list_datasets.py +25 -0
  28. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/sample_rows.py +75 -0
  29. jdatamunch_mcp-0.1.0/src/jdatamunch_mcp/tools/search_data.py +178 -0
  30. jdatamunch_mcp-0.1.0/tests/conftest.py +74 -0
  31. jdatamunch_mcp-0.1.0/tests/test_aggregate.py +80 -0
  32. jdatamunch_mcp-0.1.0/tests/test_describe_dataset.py +96 -0
  33. jdatamunch_mcp-0.1.0/tests/test_get_rows.py +142 -0
  34. jdatamunch_mcp-0.1.0/tests/test_index_local.py +82 -0
  35. jdatamunch_mcp-0.1.0/tests/test_search_data.py +66 -0
@@ -0,0 +1,24 @@
+ Metadata-Version: 2.4
+ Name: jdatamunch-mcp
+ Version: 0.1.0
+ Summary: Token-efficient MCP server for tabular data retrieval via CSV/Excel indexing
+ Project-URL: Homepage, https://github.com/jgravelle/jdatamunch-mcp
+ Project-URL: Repository, https://github.com/jgravelle/jdatamunch-mcp
+ Project-URL: Bug Tracker, https://github.com/jgravelle/jdatamunch-mcp/issues
+ Author-email: "J. Gravelle" <j@gravelle.us>
+ Requires-Python: >=3.10
+ Requires-Dist: charset-normalizer>=3.0.0
+ Requires-Dist: mcp<2.0.0,>=1.10.0
+ Provides-Extra: all
+ Requires-Dist: anthropic>=0.40.0; extra == 'all'
+ Requires-Dist: google-generativeai>=0.8.0; extra == 'all'
+ Requires-Dist: openpyxl>=3.1.0; extra == 'all'
+ Provides-Extra: anthropic
+ Requires-Dist: anthropic>=0.40.0; extra == 'anthropic'
+ Provides-Extra: excel
+ Requires-Dist: openpyxl>=3.1.0; extra == 'excel'
+ Provides-Extra: gemini
+ Requires-Dist: google-generativeai>=0.8.0; extra == 'gemini'
+ Description-Content-Type: text/markdown
+
+ # jdatamunch-mcp
@@ -0,0 +1 @@
+ # jdatamunch-mcp
@@ -0,0 +1,45 @@
+ [project]
+ name = "jdatamunch-mcp"
+ version = "0.1.0"
+ description = "Token-efficient MCP server for tabular data retrieval via CSV/Excel indexing"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ authors = [
+     { name = "J. Gravelle", email = "j@gravelle.us" },
+ ]
+ dependencies = [
+     "mcp>=1.10.0,<2.0.0",
+     "charset-normalizer>=3.0.0",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/jgravelle/jdatamunch-mcp"
+ Repository = "https://github.com/jgravelle/jdatamunch-mcp"
+ "Bug Tracker" = "https://github.com/jgravelle/jdatamunch-mcp/issues"
+
+ [project.optional-dependencies]
+ excel = ["openpyxl>=3.1.0"]
+ anthropic = ["anthropic>=0.40.0"]
+ gemini = ["google-generativeai>=0.8.0"]
+ all = ["openpyxl>=3.1.0", "anthropic>=0.40.0", "google-generativeai>=0.8.0"]
+
+ [project.scripts]
+ jdatamunch-mcp = "jdatamunch_mcp.server:main"
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/jdatamunch_mcp"]
+
+ [dependency-groups]
+ dev = [
+     "pytest>=9.0.2",
+     "pytest-asyncio>=1.3.0",
+     "pytest-cov>=7.0.0",
+ ]
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
+ asyncio_mode = "auto"
@@ -0,0 +1,21 @@
+ {
+   "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
+   "name": "io.github.jgravelle/jdatamunch-mcp",
+   "title": "jDataMunch MCP",
+   "description": "Token-efficient MCP server for tabular data retrieval. Index CSV/Excel files, query rows, aggregate — 99%+ token savings vs raw file reads.",
+   "version": "0.1.0",
+   "packages": [
+     {
+       "registryType": "pypi",
+       "identifier": "jdatamunch-mcp",
+       "version": "0.1.0",
+       "transport": {
+         "type": "stdio"
+       }
+     }
+   ],
+   "repository": {
+     "url": "https://github.com/jgravelle/jdatamunch-mcp",
+     "source": "github"
+   }
+ }
@@ -0,0 +1,3 @@
+ """jdatamunch-mcp: Token-efficient MCP server for tabular data retrieval."""
+
+ __version__ = "0.1.0"
@@ -0,0 +1,25 @@
+ """Environment variable handling and defaults for jdatamunch-mcp."""
+
+ import os
+ from pathlib import Path
+ from typing import Optional
+
+
+ def get_index_path(override: Optional[str] = None) -> Path:
+     """Return the base index storage path."""
+     if override:
+         return Path(override)
+     return Path(os.environ.get("DATA_INDEX_PATH", str(Path.home() / ".data-index")))
+
+
+ def get_max_rows() -> int:
+     return int(os.environ.get("JDATAMUNCH_MAX_ROWS", "5000000"))
+
+
+ def get_share_savings() -> bool:
+     return os.environ.get("JDATAMUNCH_SHARE_SAVINGS", "1") != "0"
+
+
+ def get_use_ai_summaries() -> bool:
+     v = os.environ.get("JDATAMUNCH_USE_AI_SUMMARIES", "true").lower()
+     return v not in ("false", "0", "no", "off")
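
For orientation, a small sketch of how these defaults react to the environment. The variable names are the ones read above; the values and paths are made up for illustration and are not part of the package.

import os
from jdatamunch_mcp.config import get_index_path, get_max_rows, get_use_ai_summaries

os.environ["DATA_INDEX_PATH"] = "/tmp/demo-index"   # hypothetical location
os.environ["JDATAMUNCH_USE_AI_SUMMARIES"] = "off"

print(get_index_path())          # /tmp/demo-index
print(get_index_path("/data"))   # explicit override wins: /data
print(get_max_rows())            # 5000000 (default when unset)
print(get_use_ai_summaries())    # False ("off" is treated as disabled)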
@@ -0,0 +1,34 @@
+ """Parser registry for tabular file formats."""
+
+ from pathlib import Path
+ from typing import Optional
+
+ from .types import ParsedDataset
+ from .csv_parser import parse_csv
+
+
+ def parse_file(
+     path: str,
+     encoding: Optional[str] = None,
+     delimiter: Optional[str] = None,
+     header_row: int = 0,
+     sheet: Optional[str] = None,
+ ) -> ParsedDataset:
+     """Parse a tabular file and return a streaming ParsedDataset."""
+     p = Path(path)
+     suffix = p.suffix.lower()
+
+     if suffix in (".csv", ".tsv", ".txt"):
+         return parse_csv(path, encoding=encoding, delimiter=delimiter, header_row=header_row)
+     elif suffix in (".xlsx", ".xls"):
+         try:
+             from .excel_parser import parse_excel
+         except ImportError as exc:
+             raise ValueError(
+                 "Excel support requires openpyxl: pip install 'jdatamunch-mcp[excel]'"
+             ) from exc
+         return parse_excel(path, sheet=sheet, header_row=header_row)
+     else:
+         raise ValueError(
+             f"Unsupported file format: {suffix!r}. Supported: .csv, .tsv, .txt, .xlsx, .xls"
+         )
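
A minimal usage sketch of the streaming ParsedDataset returned by parse_file. The "sample.csv" path is hypothetical and used only for illustration; it is not shipped with the package.

from jdatamunch_mcp.parser import parse_file

ds = parse_file("sample.csv")

print([c.name for c in ds.columns])                       # header names from the detected header row
print(ds.metadata["encoding"], ds.metadata["delimiter"])  # values detected by the parser

# row_iterator streams rows lazily, so large files are never fully loaded
row_count = sum(1 for _ in ds.row_iterator)
print(row_count)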
@@ -0,0 +1,107 @@
+ """Streaming CSV parser with auto-encoding detection."""
+
+ import csv
+ import os
+ from pathlib import Path
+ from typing import Generator, Optional
+
+ from .types import ColumnInfo, ParsedDataset
+
+
+ def _detect_encoding(path: str) -> str:
+     """Detect file encoding using charset-normalizer."""
+     try:
+         from charset_normalizer import from_path
+         result = from_path(path, cp_isolation=["utf-8", "latin-1", "cp1252"])
+         if result.best():
+             return str(result.best().encoding)
+     except Exception:
+         pass
+     return "utf-8"
+
+
+ def _detect_delimiter(sample: str) -> str:
+     """Detect CSV delimiter using csv.Sniffer."""
+     try:
+         dialect = csv.Sniffer().sniff(sample, delimiters=",\t|;")
+         return dialect.delimiter
+     except Exception:
+         return ","
+
+
+ def _row_generator(
+     path: str,
+     encoding: str,
+     delimiter: str,
+     header_row: int,
+ ) -> Generator:
+     """Yield data rows as lists of strings, skipping the header row."""
+     with open(path, newline="", encoding=encoding, errors="replace") as f:
+         reader = csv.reader(f, delimiter=delimiter)
+         for i, row in enumerate(reader):
+             if i <= header_row:
+                 continue  # skip any preamble rows and the header row itself
+             yield row
+
+
+ def parse_csv(
+     path: str,
+     encoding: Optional[str] = None,
+     delimiter: Optional[str] = None,
+     header_row: int = 0,
+ ) -> ParsedDataset:
+     """Parse a CSV file and return a streaming ParsedDataset.
+
+     The row_iterator is a fresh generator each time this is called.
+     Column names come from the header_row.
+     """
+     path = str(Path(path).resolve())
+     file_size = os.path.getsize(path)
+
+     # Detect encoding from file bytes
+     if not encoding:
+         encoding = _detect_encoding(path)
+
+     # Read a sample for delimiter sniffing
+     with open(path, newline="", encoding=encoding, errors="replace") as f:
+         sample = f.read(65536)
+
+     # Detect delimiter
+     if not delimiter:
+         if path.lower().endswith(".tsv"):
+             delimiter = "\t"
+         else:
+             delimiter = _detect_delimiter(sample)
+
+     # Parse header row to get column names
+     header: list = []
+     with open(path, newline="", encoding=encoding, errors="replace") as f:
+         reader = csv.reader(f, delimiter=delimiter)
+         for i, row in enumerate(reader):
+             if i == header_row:
+                 header = row
+                 break
+
+     columns = [ColumnInfo(name=name.strip(), position=i) for i, name in enumerate(header)]
+
+     # Estimate total row count from file size + sample density
+     sample_lines = sample.count("\n")
+     if sample_lines > 1:
+         bytes_per_row = len(sample.encode(encoding, errors="replace")) / sample_lines
+         estimated_rows = max(0, int(file_size / bytes_per_row) - 1)
+     else:
+         estimated_rows = 0
+
+     metadata = {
+         "encoding": encoding,
+         "delimiter": delimiter,
+         "header_row": header_row,
+         "estimated_rows": estimated_rows,
+         "file_size": file_size,
+     }
+
+     return ParsedDataset(
+         columns=columns,
+         row_iterator=_row_generator(path, encoding, delimiter, header_row),
+         metadata=metadata,
+     )
@@ -0,0 +1,19 @@
+ """Core dataclasses for parsed tabular datasets."""
+
+ from dataclasses import dataclass
+ from typing import Any
+
+
+ @dataclass
+ class ColumnInfo:
+     """Metadata about a single column parsed from the file header."""
+     name: str
+     position: int
+
+
+ @dataclass
+ class ParsedDataset:
+     """Result of parsing a tabular file. Row iterator is lazy/streaming."""
+     columns: list  # list[ColumnInfo]
+     row_iterator: Any  # Generator[list[str], None, None] — yields lists of raw strings
+     metadata: dict  # encoding, delimiter, header_row, estimated_rows, file_size
@@ -0,0 +1 @@
+ """Column profiling for tabular datasets."""
@@ -0,0 +1,299 @@
+ """Single-pass streaming column profiler.
+
+ Processes rows one at a time using per-column accumulators.
+ Designed to work with index_local.py's main loop where profiling
+ and SQLite loading happen in the same pass over the data.
+ """
+
+ import re
+ from dataclasses import dataclass, field
+ from typing import Any, Optional
+
+ _NULL_VALUES = frozenset([
+     "", "null", "NULL", "none", "None", "N/A", "n/a", "NA", "na",
+     "NaN", "nan", "-", ".", "#N/A", "#NA", "#NULL!", "n.a.", "N.A.",
+ ])
+
+ # Type rank: lower = more specific
+ _TYPE_RANK = {"integer": 0, "float": 1, "datetime": 2, "string": 3}
+ _TYPE_FROM_RANK = {0: "integer", 1: "float", 2: "datetime", 3: "string"}
+
+ MAX_CARDINALITY_TRACK = 5_000  # stop adding new keys to value_counts after this
+ SAMPLE_SIZE = 10  # distinct non-null samples to collect
+ RESERVOIR_SIZE = 10_000  # numeric values for approximate median
+
+ VALUE_INDEX_CARDINALITY_LIMIT = 1_000  # full value map stored if cardinality <= this
+ TOP_VALUES_LIMIT = 50  # top values stored for high-cardinality columns
+
+ # Common datetime patterns (regex → strptime format string)
+ _DATETIME_PATTERNS = [
+     (re.compile(r"^\d{4}-\d{2}-\d{2}$"), "%Y-%m-%d"),
+     (re.compile(r"^\d{2}/\d{2}/\d{4}$"), "%m/%d/%Y"),
+     (re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}"), "%Y-%m-%dT%H:%M:%S"),
+     (re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"), "%Y-%m-%d %H:%M:%S"),
+     (re.compile(r"^\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}"), "%m/%d/%Y %H:%M:%S"),
+     # US date + 12h time with AM/PM (e.g. "01/15/2020 12:00:00 AM")
+     (re.compile(r"^\d{1,2}/\d{1,2}/\d{4} \d{1,2}:\d{2}:\d{2} [AP]M$"), "%m/%d/%Y %I:%M:%S %p"),
+ ]
+
+
+ def _is_datetime_str(value: str) -> bool:
+     for rx, _ in _DATETIME_PATTERNS:
+         if rx.match(value):
+             return True
+     return False
+
+
+ def _get_datetime_format(value: str) -> Optional[str]:
+     for rx, fmt in _DATETIME_PATTERNS:
+         if rx.match(value):
+             return fmt
+     return None
+
+
+ @dataclass
+ class _ColAcc:
+     """Per-column accumulator updated once per row."""
+     name: str
+     position: int
+     # Type tracking (rank only advances upward)
+     type_rank: int = 0
+     # Row counts
+     count: int = 0  # non-null rows
+     null_count: int = 0
+     # Numeric stats (valid when type_rank <= 1)
+     num_min: float = field(default_factory=lambda: float("inf"))
+     num_max: float = field(default_factory=lambda: float("-inf"))
+     num_sum: float = 0.0
+     # Reservoir for approximate median (first RESERVOIR_SIZE numeric values)
+     reservoir: list = field(default_factory=list)
+     # Cardinality / value frequency
+     value_counts: dict = field(default_factory=dict)
+     cardinality_overflow: bool = False
+     # Samples: first SAMPLE_SIZE distinct non-null values seen
+     samples: list = field(default_factory=list)
+     _samples_set: set = field(default_factory=set)
+     # Datetime range (valid when type_rank == 2)
+     dt_min: Optional[str] = None
+     dt_max: Optional[str] = None
+     dt_format: Optional[str] = None
+
+
+ def update_acc(acc: _ColAcc, raw_value: str) -> None:
+     """Update accumulator with one raw string value from the CSV."""
+     stripped = raw_value.strip() if raw_value else ""
+
+     if stripped in _NULL_VALUES:
+         acc.null_count += 1
+         return
+
+     acc.count += 1
+
+     # --- Type detection & promotion ---
+     if acc.type_rank == 0:  # currently integer
+         try:
+             int(stripped)
+         except ValueError:
+             acc.type_rank = 1  # promote to float
+
+     if acc.type_rank == 1:  # currently float
+         try:
+             float(stripped)
+         except ValueError:
+             # Check datetime before falling to string
+             if _is_datetime_str(stripped):
+                 acc.type_rank = 2
+             else:
+                 acc.type_rank = 3  # string
+
+     if acc.type_rank == 2:  # currently datetime
+         if not _is_datetime_str(stripped):
+             acc.type_rank = 3  # string
+         elif acc.dt_format is None:
+             acc.dt_format = _get_datetime_format(stripped)
+
+     # --- Numeric stats ---
+     if acc.type_rank <= 1:
+         try:
+             num = float(stripped)
+             if num < acc.num_min:
+                 acc.num_min = num
+             if num > acc.num_max:
+                 acc.num_max = num
+             acc.num_sum += num
+             if len(acc.reservoir) < RESERVOIR_SIZE:
+                 acc.reservoir.append(num)
+         except ValueError:
+             pass
+
+     # --- Datetime min/max ---
+     if acc.type_rank == 2:
+         if acc.dt_min is None or stripped < acc.dt_min:
+             acc.dt_min = stripped
+         if acc.dt_max is None or stripped > acc.dt_max:
+             acc.dt_max = stripped
+
+     # --- Cardinality / value counts ---
+     if stripped in acc.value_counts:
+         acc.value_counts[stripped] += 1
+     elif not acc.cardinality_overflow:
+         if len(acc.value_counts) < MAX_CARDINALITY_TRACK:
+             acc.value_counts[stripped] = 1
+         else:
+             acc.cardinality_overflow = True
+             acc.value_counts[stripped] = 1
+
+     # --- Samples ---
+     if len(acc.samples) < SAMPLE_SIZE and stripped not in acc._samples_set:
+         acc.samples.append(stripped)
+         acc._samples_set.add(stripped)
+
+
+ def _compute_median(reservoir: list) -> Optional[float]:
+     if not reservoir:
+         return None
+     sorted_vals = sorted(reservoir)
+     n = len(sorted_vals)
+     mid = n // 2
+     if n % 2 == 0:
+         return (sorted_vals[mid - 1] + sorted_vals[mid]) / 2.0
+     return float(sorted_vals[mid])
+
+
+ @dataclass
+ class ColumnProfile:
+     """Fully computed profile for a single column."""
+     name: str
+     position: int
+     type: str  # "integer", "float", "datetime", "string"
+     count: int  # non-null row count
+     null_count: int
+     null_pct: float
+     cardinality: int
+     cardinality_is_exact: bool
+     is_unique: bool
+     is_primary_key_candidate: bool
+     min: Optional[Any]
+     max: Optional[Any]
+     mean: Optional[float]
+     median: Optional[float]
+     sample_values: list
+     value_index: Optional[dict]  # full {value: count} for cardinality <= 1000
+     top_values: Optional[list]  # [{"value": ..., "count": ...}] for high-cardinality
+     datetime_min: Optional[str] = None
+     datetime_max: Optional[str] = None
+     datetime_format: Optional[str] = None
+     ai_summary: Optional[str] = None
+
+
+ def finalize_profile(acc: _ColAcc) -> ColumnProfile:
+     """Build a ColumnProfile from a completed _ColAcc."""
+     total = acc.count + acc.null_count
+     null_pct = round(acc.null_count / total * 100, 1) if total > 0 else 0.0
+     col_type = _TYPE_FROM_RANK[acc.type_rank]
+
+     cardinality = len(acc.value_counts)
+     cardinality_is_exact = not acc.cardinality_overflow
+
+     is_unique = (cardinality_is_exact and cardinality == acc.count and acc.count > 0)
+     is_pk_candidate = (
+         is_unique
+         and acc.null_count == 0
+         and col_type in ("integer", "string")
+     )
+
+     # Numeric stats
+     if col_type in ("integer", "float") and acc.count > 0:
+         raw_min = acc.num_min
+         raw_max = acc.num_max
+         mean_val = round(acc.num_sum / acc.count, 4)
+         median_val = _compute_median(acc.reservoir)
+         if col_type == "integer":
+             min_val = int(raw_min) if raw_min != float("inf") else None
+             max_val = int(raw_max) if raw_max != float("-inf") else None
+             median_val = round(median_val, 1) if median_val is not None else None
+         else:
+             min_val = raw_min if raw_min != float("inf") else None
+             max_val = raw_max if raw_max != float("-inf") else None
+     else:
+         min_val = max_val = mean_val = median_val = None
+
+     # Convert sample values to their native type
+     samples: list = []
+     for s in acc.samples:
+         if col_type == "integer":
+             try:
+                 samples.append(int(s))
+                 continue
+             except ValueError:
+                 pass
+         elif col_type == "float":
+             try:
+                 samples.append(float(s))
+                 continue
+             except ValueError:
+                 pass
+         samples.append(s)
+
+     # Value index / top values
+     if cardinality <= VALUE_INDEX_CARDINALITY_LIMIT:
+         value_index: Optional[dict] = {}
+         for val_str, cnt in acc.value_counts.items():
+             if col_type == "integer":
+                 try:
+                     key: Any = int(val_str)
+                 except ValueError:
+                     key = val_str
+             elif col_type == "float":
+                 try:
+                     key = float(val_str)
+                 except ValueError:
+                     key = val_str
+             else:
+                 key = val_str
+             value_index[str(key)] = cnt
+         top_values = None
+     else:
+         value_index = None
+         sorted_vals = sorted(acc.value_counts.items(), key=lambda x: x[1], reverse=True)
+         top_values = [{"value": v, "count": c} for v, c in sorted_vals[:TOP_VALUES_LIMIT]]
+
+     return ColumnProfile(
+         name=acc.name,
+         position=acc.position,
+         type=col_type,
+         count=acc.count,
+         null_count=acc.null_count,
+         null_pct=null_pct,
+         cardinality=cardinality,
+         cardinality_is_exact=cardinality_is_exact,
+         is_unique=is_unique,
+         is_primary_key_candidate=is_pk_candidate,
+         min=min_val,
+         max=max_val,
+         mean=mean_val,
+         median=median_val,
+         sample_values=samples,
+         value_index=value_index,
+         top_values=top_values,
+         datetime_min=acc.dt_min if col_type == "datetime" else None,
+         datetime_max=acc.dt_max if col_type == "datetime" else None,
+         datetime_format=acc.dt_format if col_type == "datetime" else None,
+     )
+
+
+ def infer_types_from_sample(
+     accs: list,  # list[_ColAcc]
+     sample_rows: list,
+ ) -> list:
+     """Run a subset of rows through the accumulators to detect preliminary types.
+
+     Used to determine column types before creating the SQLite schema.
+     Returns the modified accs list (same objects, updated in-place).
+     """
+     n_cols = len(accs)
+     for row in sample_rows:
+         for i, acc in enumerate(accs):
+             raw = row[i] if i < len(row) else ""
+             update_acc(acc, raw)
+     return accs
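
As a rough illustration of the single-pass flow described in the module docstring, a caller such as index_local.py might drive the accumulators like this. The column names and rows below are invented for the example; the real tool feeds rows from ParsedDataset.row_iterator and interleaves SQLite loading in the same loop.

from jdatamunch_mcp.profiler.column_profiler import _ColAcc, update_acc, finalize_profile

columns = ["id", "city"]                                   # hypothetical header
rows = [["1", "Austin"], ["2", "Boston"], ["3", ""]]       # hypothetical data rows

accs = [_ColAcc(name=c, position=i) for i, c in enumerate(columns)]
for row in rows:
    for i, acc in enumerate(accs):
        update_acc(acc, row[i] if i < len(row) else "")

profiles = [finalize_profile(acc) for acc in accs]
print(profiles[0].type, profiles[0].is_primary_key_candidate)  # integer True
print(profiles[1].null_count)                                  # 1; "" is treated as a null value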
@@ -0,0 +1,36 @@
+ """Numeric histogram computation from a column profile's reservoir."""
+
+ from typing import Optional
+
+
+ def compute_histogram(
+     reservoir: list,
+     bins: int = 10,
+     col_min: Optional[float] = None,
+     col_max: Optional[float] = None,
+ ) -> Optional[dict]:
+     """Compute a histogram from a reservoir of numeric values.
+
+     Returns a dict with 'bins' (count per bin) and 'edges' (bin boundaries),
+     or None if the reservoir is empty.
+     """
+     if not reservoir:
+         return None
+
+     lo = col_min if col_min is not None else min(reservoir)
+     hi = col_max if col_max is not None else max(reservoir)
+
+     if lo == hi:
+         return {"bins": [len(reservoir)], "edges": [lo, hi], "bin_count": 1}
+
+     bin_width = (hi - lo) / bins
+     counts = [0] * bins
+     edges = [round(lo + i * bin_width, 6) for i in range(bins + 1)]
+
+     for val in reservoir:
+         idx = int((val - lo) / bin_width)
+         if idx >= bins:
+             idx = bins - 1
+         counts[idx] += 1
+
+     return {"bins": counts, "edges": edges, "bin_count": bins}
@@ -0,0 +1,32 @@
+ """Value index builder for low-cardinality columns.
+
+ Low-cardinality columns (cardinality <= VALUE_INDEX_CARDINALITY_LIMIT) get a
+ full inverted index: {value_str: count}. This index is stored in index.json
+ and used by search_data for fast value matching without touching SQLite.
+ """
+
+ # Re-export the constant so callers can import from this module
+ from .column_profiler import VALUE_INDEX_CARDINALITY_LIMIT
+
+
+ def build_value_search_index(profiles: list) -> dict:
+     """Build a search-optimised view of all value indexes.
+
+     Returns:
+         {column_name: {"low_cardinality": bool, "values": [str, ...]}}
+
+     Used by search_data to quickly find columns whose values match a query.
+     """
+     result = {}
+     for p in profiles:
+         if p.value_index is not None:
+             result[p.name] = {
+                 "low_cardinality": True,
+                 "values": list(p.value_index.keys()),
+             }
+         elif p.top_values is not None:
+             result[p.name] = {
+                 "low_cardinality": False,
+                 "values": [tv["value"] for tv in p.top_values],
+             }
+     return result
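
A small sketch of the search-index shape, using stand-in profile objects instead of real ColumnProfile instances (the column names and values here are invented for illustration):

from types import SimpleNamespace
from jdatamunch_mcp.profiler.value_indexer import build_value_search_index

# Stand-ins for profiles produced by finalize_profile().
city = SimpleNamespace(name="city", value_index={"Austin": 2, "Boston": 1}, top_values=None)
notes = SimpleNamespace(name="notes", value_index=None, top_values=[{"value": "ok", "count": 9}])

print(build_value_search_index([city, notes]))
# {'city': {'low_cardinality': True, 'values': ['Austin', 'Boston']},
#  'notes': {'low_cardinality': False, 'values': ['ok']}}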