thaieda 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thaieda-1.0.0/.github/ISSUE_TEMPLATE/bug_report.md +31 -0
- thaieda-1.0.0/.github/ISSUE_TEMPLATE/feature_request.md +19 -0
- thaieda-1.0.0/.github/workflows/ci.yml +41 -0
- thaieda-1.0.0/.gitignore +60 -0
- thaieda-1.0.0/AGENTS.md +55 -0
- thaieda-1.0.0/CHANGELOG.md +275 -0
- thaieda-1.0.0/CODE_OF_CONDUCT.md +55 -0
- thaieda-1.0.0/CONTRIBUTING.md +49 -0
- thaieda-1.0.0/LICENSE +201 -0
- thaieda-1.0.0/PKG-INFO +270 -0
- thaieda-1.0.0/README.md +193 -0
- thaieda-1.0.0/assets/fonts/README.md +13 -0
- thaieda-1.0.0/assets/fonts/Sarabun-OFL.txt +93 -0
- thaieda-1.0.0/assets/fonts/Sarabun-Regular.ttf +0 -0
- thaieda-1.0.0/eval/README.md +130 -0
- thaieda-1.0.0/eval/fixtures/build_fixtures.py +243 -0
- thaieda-1.0.0/eval/fixtures/clean-thai.csv +101 -0
- thaieda-1.0.0/eval/fixtures/coffee-chain/CUSTOMER.csv +6001 -0
- thaieda-1.0.0/eval/fixtures/coffee-chain/DATE_DIM.csv +732 -0
- thaieda-1.0.0/eval/fixtures/coffee-chain/INVENTORY.csv +10001 -0
- thaieda-1.0.0/eval/fixtures/coffee-chain/LOCAL_EVENT.csv +1441 -0
- thaieda-1.0.0/eval/fixtures/coffee-chain/ORDER.csv +10001 -0
- thaieda-1.0.0/eval/fixtures/coffee-chain/PRODUCT.csv +61 -0
- thaieda-1.0.0/eval/fixtures/coffee-chain/PROMOTION.csv +8001 -0
- thaieda-1.0.0/eval/fixtures/coffee-chain/STORE.csv +21 -0
- thaieda-1.0.0/eval/fixtures/coffee-chain/TRANSACTION.csv +20978 -0
- thaieda-1.0.0/eval/fixtures/dirty-thai-labeled.csv +81 -0
- thaieda-1.0.0/eval/fixtures/superstore.csv +10801 -0
- thaieda-1.0.0/eval/manifests/coffee-chain-schema.expected.json +25 -0
- thaieda-1.0.0/eval/manifests/dirty-thai-labeled.expected.json +12 -0
- thaieda-1.0.0/eval/results/REPORT.md +58 -0
- thaieda-1.0.0/eval/results/results.json +60 -0
- thaieda-1.0.0/eval/run_eval.py +253 -0
- thaieda-1.0.0/eval/scenarios/__init__.py +1 -0
- thaieda-1.0.0/eval/scenarios/s1_thai_quality.py +115 -0
- thaieda-1.0.0/eval/scenarios/s2_relationships.py +79 -0
- thaieda-1.0.0/eval/scenarios/s3_insight_honesty.py +106 -0
- thaieda-1.0.0/pyproject.toml +126 -0
- thaieda-1.0.0/research/README.md +8 -0
- thaieda-1.0.0/research/eda-anomaly-research.md +194 -0
- thaieda-1.0.0/research/llm-privacy-design.md +776 -0
- thaieda-1.0.0/research/viz-cleaning-research.md +49 -0
- thaieda-1.0.0/src/thaieda/__init__.py +295 -0
- thaieda-1.0.0/src/thaieda/analysis/__init__.py +266 -0
- thaieda-1.0.0/src/thaieda/anomaly/__init__.py +1369 -0
- thaieda-1.0.0/src/thaieda/clean/__init__.py +948 -0
- thaieda-1.0.0/src/thaieda/cli.py +812 -0
- thaieda-1.0.0/src/thaieda/detect/__init__.py +357 -0
- thaieda-1.0.0/src/thaieda/i18n/__init__.py +268 -0
- thaieda-1.0.0/src/thaieda/insight/__init__.py +694 -0
- thaieda-1.0.0/src/thaieda/insight_engine/__init__.py +1248 -0
- thaieda-1.0.0/src/thaieda/io/__init__.py +235 -0
- thaieda-1.0.0/src/thaieda/llm/__init__.py +128 -0
- thaieda-1.0.0/src/thaieda/llm/_anonymize.py +212 -0
- thaieda-1.0.0/src/thaieda/llm/_prepare.py +279 -0
- thaieda-1.0.0/src/thaieda/llm/_prompt.py +248 -0
- thaieda-1.0.0/src/thaieda/llm/_provider.py +250 -0
- thaieda-1.0.0/src/thaieda/ner/__init__.py +260 -0
- thaieda-1.0.0/src/thaieda/quality/__init__.py +636 -0
- thaieda-1.0.0/src/thaieda/report/__init__.py +921 -0
- thaieda-1.0.0/src/thaieda/report/_dataset.py +143 -0
- thaieda-1.0.0/src/thaieda/report/_dataset_template.py +214 -0
- thaieda-1.0.0/src/thaieda/report/_template.py +583 -0
- thaieda-1.0.0/src/thaieda/schema/__init__.py +656 -0
- thaieda-1.0.0/src/thaieda/text/__init__.py +295 -0
- thaieda-1.0.0/src/thaieda/timeseries/__init__.py +620 -0
- thaieda-1.0.0/src/thaieda/tokenize/__init__.py +150 -0
- thaieda-1.0.0/src/thaieda/viz/__init__.py +1306 -0
- thaieda-1.0.0/tests/test_analysis.py +124 -0
- thaieda-1.0.0/tests/test_anomaly.py +391 -0
- thaieda-1.0.0/tests/test_clean.py +346 -0
- thaieda-1.0.0/tests/test_cli.py +183 -0
- thaieda-1.0.0/tests/test_detect.py +209 -0
- thaieda-1.0.0/tests/test_insight.py +295 -0
- thaieda-1.0.0/tests/test_insight_engine.py +377 -0
- thaieda-1.0.0/tests/test_io.py +141 -0
- thaieda-1.0.0/tests/test_llm.py +663 -0
- thaieda-1.0.0/tests/test_ner.py +150 -0
- thaieda-1.0.0/tests/test_oneliner.py +361 -0
- thaieda-1.0.0/tests/test_quality.py +166 -0
- thaieda-1.0.0/tests/test_report.py +168 -0
- thaieda-1.0.0/tests/test_schema.py +378 -0
- thaieda-1.0.0/tests/test_text.py +86 -0
- thaieda-1.0.0/tests/test_timeseries.py +221 -0
- thaieda-1.0.0/tests/test_tokenize.py +68 -0
- thaieda-1.0.0/tests/test_v08.py +277 -0
- thaieda-1.0.0/tests/test_viz.py +249 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug report
|
|
3
|
+
about: รายงานปัญหาการใช้งาน
|
|
4
|
+
title: "[BUG] "
|
|
5
|
+
labels: bug
|
|
6
|
+
assignees: ''
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## อธิบายปัญหา
|
|
10
|
+
คำอธิบายสั้น ๆ ว่าเกิดอะไรขึ้น
|
|
11
|
+
|
|
12
|
+
## ข้อมูลสำหรับทำซ้ำ
|
|
13
|
+
ขั้นตอนที่ทำให้เกิดปัญหา:
|
|
14
|
+
1. ...
|
|
15
|
+
2. ...
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
# โค้ดตัวอย่าง
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## ผลที่ได้
|
|
22
|
+
...
|
|
23
|
+
|
|
24
|
+
## ผลที่คาดหวัง
|
|
25
|
+
...
|
|
26
|
+
|
|
27
|
+
## สภาพแวดล้อม
|
|
28
|
+
- OS: [เช่น Windows 11, Ubuntu 22.04]
|
|
29
|
+
- Python version:
|
|
30
|
+
- ThaiEDA version:
|
|
31
|
+
- Tokenizer engine: [pythainlp / nlpo3 / attacut / none]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature request
|
|
3
|
+
about: เสนอ feature ใหม่
|
|
4
|
+
title: "[FEAT] "
|
|
5
|
+
labels: enhancement
|
|
6
|
+
assignees: ''
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Feature ที่อยากเสนอ
|
|
10
|
+
...
|
|
11
|
+
|
|
12
|
+
## ทำไมถึงมีประโยชน์
|
|
13
|
+
...
|
|
14
|
+
|
|
15
|
+
## ทางเลือกที่พอมี
|
|
16
|
+
...
|
|
17
|
+
|
|
18
|
+
## ข้อมูลเพิ่มเติม
|
|
19
|
+
...
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, develop]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ${{ matrix.os }}
|
|
12
|
+
strategy:
|
|
13
|
+
fail-fast: false
|
|
14
|
+
matrix:
|
|
15
|
+
os: [ubuntu-latest, windows-latest]
|
|
16
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: ${{ matrix.python-version }}
|
|
25
|
+
|
|
26
|
+
- name: Install dependencies
|
|
27
|
+
run: |
|
|
28
|
+
python -m pip install --upgrade pip
|
|
29
|
+
pip install -e ".[dev,thai,viz]"
|
|
30
|
+
|
|
31
|
+
- name: Lint with ruff
|
|
32
|
+
run: |
|
|
33
|
+
ruff check src/ tests/
|
|
34
|
+
ruff format --check src/ tests/
|
|
35
|
+
|
|
36
|
+
- name: Type check with mypy
|
|
37
|
+
run: mypy src/thaieda --ignore-missing-imports
|
|
38
|
+
continue-on-error: true
|
|
39
|
+
|
|
40
|
+
- name: Test with pytest
|
|
41
|
+
run: pytest tests/ -v --tb=short
|
thaieda-1.0.0/.gitignore
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
*.egg-info/
|
|
7
|
+
*.egg
|
|
8
|
+
dist/
|
|
9
|
+
build/
|
|
10
|
+
.eggs/
|
|
11
|
+
wheels/
|
|
12
|
+
|
|
13
|
+
# Virtual environments
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
ENV/
|
|
18
|
+
|
|
19
|
+
# IDE
|
|
20
|
+
.idea/
|
|
21
|
+
.vscode/
|
|
22
|
+
*.swp
|
|
23
|
+
*.swo
|
|
24
|
+
*~
|
|
25
|
+
.DS_Store
|
|
26
|
+
|
|
27
|
+
# Testing
|
|
28
|
+
.pytest_cache/
|
|
29
|
+
.coverage
|
|
30
|
+
htmlcov/
|
|
31
|
+
.tox/
|
|
32
|
+
.mypy_cache/
|
|
33
|
+
.ruff_cache/
|
|
34
|
+
|
|
35
|
+
# Jupyter
|
|
36
|
+
.ipynb_checkpoints/
|
|
37
|
+
*.ipynb_checkpoints
|
|
38
|
+
|
|
39
|
+
# OS
|
|
40
|
+
Thumbs.db
|
|
41
|
+
desktop.ini
|
|
42
|
+
|
|
43
|
+
# Secrets — ห้าม commit เด็ดขาด
|
|
44
|
+
.env
|
|
45
|
+
.env.*
|
|
46
|
+
*.pem
|
|
47
|
+
*.key
|
|
48
|
+
secret*
|
|
49
|
+
credentials*
|
|
50
|
+
|
|
51
|
+
# Generated reports (output จากการรัน)
|
|
52
|
+
*.thaieda.html
|
|
53
|
+
*.cleaned.csv
|
|
54
|
+
reports_output/
|
|
55
|
+
|
|
56
|
+
# Test/example data — ห้าม commit (ไฟล์ใหญ่ ใช้ทดสอบเท่านั้น)
|
|
57
|
+
data-example/
|
|
58
|
+
|
|
59
|
+
# Git worktrees ภายใน (สำหรับงาน agent)
|
|
60
|
+
_worktrees/
|
thaieda-1.0.0/AGENTS.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# AGENTS.md — ThaiEDA
|
|
2
|
+
|
|
3
|
+
## Project
|
|
4
|
+
ThaiEDA คือ open-source AutoEDA library สำหรับข้อมูลภาษาไทย
|
|
5
|
+
- Repo: https://github.com/peetwan/thaieda
|
|
6
|
+
- License: Apache-2.0
|
|
7
|
+
- Python 3.10+, pandas + matplotlib + Jinja2 + pythainlp (optional)
|
|
8
|
+
- Current version: v0.8 (commit 3351e9c)
|
|
9
|
+
- Tests: 356 passed, 1 skipped, ruff clean
|
|
10
|
+
|
|
11
|
+
## Structure
|
|
12
|
+
```
|
|
13
|
+
src/thaieda/
|
|
14
|
+
detect/ column type detection + Thai month name detection (v0.8)
|
|
15
|
+
tokenize/ tokenizer adapter (pythainlp/nlpo3/attacut)
|
|
16
|
+
text/ text metrics
|
|
17
|
+
quality/ Thai data quality checks + placeholder/constant detection (v0.8) + vectorized BE
|
|
18
|
+
anomaly/ anomaly detection (numeric + text + Thai-specific)
|
|
19
|
+
clean/ data cleaning: encoding, zwspace, numerals, BE→CE, dates, duplicates, missing (v0.8)
|
|
20
|
+
ner/ Thai NER (v0.2)
|
|
21
|
+
analysis/ target variable analysis (v0.2)
|
|
22
|
+
insight/ auto insight summary — interpreter (v0.3) + distribution/correlation (v0.4)
|
|
23
|
+
insight_engine/ cross-column insight discovery — 6 patterns + BH correction (v0.6+v0.8)
|
|
24
|
+
timeseries/ timeseries analysis (v0.4)
|
|
25
|
+
schema/ multi-file schema discovery — PK/FK + relationship matching (v0.5)
|
|
26
|
+
viz/ visualization + Thai font + insight charts (v0.7)
|
|
27
|
+
report/ HTML report (Jinja2) + DatasetReport (v0.5)
|
|
28
|
+
i18n/ TH/EN labels
|
|
29
|
+
llm/ privacy-preserving LLM analysis — 4 privacy modes, OpenAI/Anthropic/Ollama providers (v0.9)
|
|
30
|
+
tests/ pytest (356 tests)
|
|
31
|
+
research/ research notes (cron-generated, NOT source code)
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## v0.8 additions
|
|
35
|
+
- clean/: coerce_numeric_column, convert_buddhist_era, normalize_dates, remove_duplicate_rows, handle_missing_values
|
|
36
|
+
- insight_engine/: _detect_strong_correlations, _detect_outlier_insights, cross-pattern novelty, adaptive min_segment, all_buckets in trend evidence
|
|
37
|
+
- quality/: check_placeholder_values, check_constant_column, vectorized check_buddhist_era
|
|
38
|
+
- detect/: Thai month name detection in _looks_like_datetime
|
|
39
|
+
- io/: Excel (.xlsx/.xls) support via openpyxl
|
|
40
|
+
|
|
41
|
+
## Cron job rules
|
|
42
|
+
- Cron jobs แก้ไขเฉพาะ `research/` directory เท่านั้น
|
|
43
|
+
- ห้ามแก้ src/, tests/, pyproject.toml, หรือ config อื่น ๆ
|
|
44
|
+
- ทุก finding ต้องมี source URL
|
|
45
|
+
- Git commit message: `docs: research update - <topic> (YYYY-MM-DD)`
|
|
46
|
+
- Push to origin main
|
|
47
|
+
|
|
48
|
+
## Coding conventions
|
|
49
|
+
- Python 3.10+ with type hints
|
|
50
|
+
- Thai docstrings/comments
|
|
51
|
+
- Lazy imports for optional deps
|
|
52
|
+
- No silent fallbacks — fail loudly with helpful message
|
|
53
|
+
- matplotlib Agg backend (no GUI)
|
|
54
|
+
- Vectorized operations preferred (pandas .str accessors over row-by-row loops)
|
|
55
|
+
- Use contextlib.suppress instead of try/except/pass
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.9.0] - 2026-06-25
|
|
9
|
+
|
|
10
|
+
Headline feature: **privacy-preserving LLM analysis** — feed your EDA results to
|
|
11
|
+
any LLM (OpenAI / Anthropic / Ollama) with selectable privacy modes so you
|
|
12
|
+
control exactly what data leaves your machine.
|
|
13
|
+
|
|
14
|
+
### Added — LLM Analysis Module
|
|
15
|
+
|
|
16
|
+
Four **privacy modes** for `analyze_with_llm()` — ranked from safest to riskiest:
|
|
17
|
+
|
|
18
|
+
| Mode | What leaves the machine | Privacy guarantee | Use case |
|
|
19
|
+
|------|-------------------------|-------------------|----------|
|
|
20
|
+
| `insight_only` (default) | Only summary statistics + insight cards sent | Raw data never leaves the machine | Regulated/PDPA data, default for cautious users |
|
|
21
|
+
| `anonymized` | PII replaced with reversible tokens before sending | Names/phones/ID cards replaced with `[NAME_1]`, `[PHONE_1]` | Need LLM to see structure/patterns without raw PII |
|
|
22
|
+
| `dp_noise` | Stats with Laplace mechanism noise (ε configurable) | DP noise prevents re-identification from small stats | Small datasets where stats alone may leak identity |
|
|
23
|
+
| `full` | All raw data sent (user accepts risk) | None — user accepts tradeoff for max accuracy | Public/non-sensitive data; dev/demo workflows |
|
|
24
|
+
|
|
25
|
+
### New Files (v0.9)
|
|
26
|
+
|
|
27
|
+
- `src/thaieda/llm/__init__.py` — public API: `analyze_with_llm(df, privacy, provider, model, language)`
|
|
28
|
+
- `src/thaieda/llm/_prepare.py` — `prepare_for_llm()` — prepares data per privacy mode (4 modes)
|
|
29
|
+
- `src/thaieda/llm/_anonymize.py` — `anonymize_dataframe()` — PII detection (phone, ID card, NER)
|
|
30
|
+
- `src/thaieda/llm/_prompt.py` — `build_prompt()` — Thai/English prompt builder
|
|
31
|
+
- `src/thaieda/llm/_provider.py` — `call_llm()` — lazy import OpenAI/Anthropic/Ollama
|
|
32
|
+
- `tests/test_llm.py` — 59 tests covering all modes + protocol contracts
|
|
33
|
+
|
|
34
|
+
### Added — Three LLM Providers (all optional / lazy)
|
|
35
|
+
|
|
36
|
+
| Provider | Package | Default model | API key env var |
|
|
37
|
+
|----------|---------|---------------|-----------------|
|
|
38
|
+
| `openai` (default) | `pip install openai` | `gpt-4o-mini` | `OPENAI_API_KEY` |
|
|
39
|
+
| `anthropic` | `pip install anthropic` | `claude-3-5-sonnet-20241022` | `ANTHROPIC_API_KEY` |
|
|
40
|
+
| `ollama` | `pip install ollama` *(or built-in HTTP fallback)* | `llama3.1` | `OLLAMA_HOST` (default `http://localhost:11434`) |
|
|
41
|
+
|
|
42
|
+
**All LLM dependencies are optional and lazy-imported** — the library imports
|
|
43
|
+
fine without `openai`/`anthropic`/`ollama` installed. The provider raises a
|
|
44
|
+
helpful `ImportError`/`RuntimeError` only when actually called.
|
|
45
|
+
|
|
46
|
+
### Privacy Guarantees Per Mode
|
|
47
|
+
|
|
48
|
+
- **`insight_only`** — the raw DataFrame is never sent; the prompt contains only the `summary` and insight
|
|
49
|
+
cards computed locally. The LLM sees aggregated stats, not rows. **Nothing
|
|
50
|
+
user-identifiable leaves the machine** beyond aggregate statistics.
|
|
51
|
+
- **`anonymized`** — phone numbers (all Thai formats: `081-234-5678`,
|
|
52
|
+
`0812345678`, `+66812345678`), ID cards (`X-XXXX-XXXXX-XX-X`), and named
|
|
53
|
+
entities (PERSON/LOCATION/ORGANIZATION via NER) are replaced with consistent
|
|
54
|
+
tokens. The `token_map` is returned so users can reverse-lookup tokens to
|
|
55
|
+
originals locally. Numeric columns are untouched.
|
|
56
|
+
- **`dp_noise`** — Laplace mechanism noise added to numeric stats (mean, min,
|
|
57
|
+
max, count) and categorical counts. Configurable `epsilon` (smaller = more
|
|
58
|
+
noise = more privacy). `epsilon=0` raises `ValueError`.
|
|
59
|
+
- **`full`** — raw DataFrame copied and sent in prompt. No privacy guarantee.
|
|
60
|
+
|
|
61
|
+
### Tests
|
|
62
|
+
|
|
63
|
+
- 23 new tests in `test_llm.py` covering real-world Thai government dataset,
|
|
64
|
+
token reversibility, prompt branding, full pipeline mock, dp_noise with fixed
|
|
65
|
+
seed, and ollama dispatch.
|
|
66
|
+
- Total LLM tests: 59 (36 original + 23 new), all passing.
|
|
67
|
+
- Ruff clean.
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## [0.8.0] - 2026-06-25
|
|
72
|
+
|
|
73
|
+
Headline feature: **clean data + actionable insights** — 12 improvements addressing
|
|
74
|
+
real-world gaps found by code review (Claude Code) and testing on messy Thai data.
|
|
75
|
+
|
|
76
|
+
### Added — Data Cleaning
|
|
77
|
+
- **`coerce_numeric_column()`** — converts string columns with Thai numerals to proper
|
|
78
|
+
numeric dtype (not NaN). Handles placeholder values (`-`, `N/A`, `ไม่มี`) → NaN first.
|
|
79
|
+
- **`convert_buddhist_era()`** — converts BE years (2440–2599) to CE in both numeric
|
|
80
|
+
columns (`2530` → `1987`) and date-string columns (`2567-01-15` → `2024-01-15`).
|
|
81
|
+
- **`normalize_dates()`** — standardizes Thai month name dates to a numeric `DD/MM/YYYY` format
|
|
82
|
+
(`15 มกราคม 2567` → `15/01/2024`) and converts BE→CE in date strings.
|
|
83
|
+
- **`remove_duplicate_rows()`** — detects and removes fully duplicated rows from a DataFrame.
|
|
84
|
+
- **`handle_missing_values()`** — 5 strategies: `flag` (0/ไม่ระบุ), `drop`, `median`,
|
|
85
|
+
`mode`, `unknown`. Type-aware (numeric vs text).
|
|
86
|
+
|
|
87
|
+
### Added — Insight Engine
|
|
88
|
+
- **Correlation pattern** — detects strong pairwise correlations (|r| ≥ 0.7) between
|
|
89
|
+
numeric columns and generates insight cards with r-value and direction.
|
|
90
|
+
- **Outlier pattern** — detects row-level statistical outliers (z-score ≥ 3) and
|
|
91
|
+
generates insight cards with outlier count, percentage, and max z-score.
|
|
92
|
+
- **Cross-pattern novelty penalty** — when multiple patterns point to the same
|
|
93
|
+
breakdown × measure × segment (e.g., an outlier causing outstanding + comparison +
|
|
94
|
+
attribution), only the highest-ranked keeps full score.
|
|
95
|
+
- **Adaptive `min_segment`** — auto-adjusts for small datasets (`max(5, total_n // 20)`)
|
|
96
|
+
with a note when triggered.
|
|
97
|
+
- **Trend evidence `all_buckets`** — trend findings now include the full bucket series
|
|
98
|
+
in evidence, fixing the v0.7 bug where trend charts showed only 2 data points.
|
|
99
|
+
- **`_recompute_full` for correlation/outlier** — correlation and outlier evidence is
|
|
100
|
+
recomputed on the full dataset (not just sample) during two-phase scoring.
|
|
101
|
+
|
|
102
|
+
### Added — Quality Checks
|
|
103
|
+
- **`check_placeholder_values()`** — flags placeholder values (`-`, `N/A`, `NULL`,
|
|
104
|
+
`ไม่มี`, `ไม่ระบุ`) that should be NaN. Vectorized with `.isin()`.
|
|
105
|
+
- **`check_constant_column()`** — flags columns with zero variance (single unique value)
|
|
106
|
+
as useless for analysis.
|
|
107
|
+
|
|
108
|
+
### Added — Detection
|
|
109
|
+
- **Thai month name detection** — `_looks_like_datetime` now recognizes Thai month names
|
|
110
|
+
(`15 มกราคม 2567`, `1 ก.พ. 67`) in addition to numeric date formats.
|
|
111
|
+
|
|
112
|
+
### Added — I/O
|
|
113
|
+
- **Excel support** — `.xlsx` and `.xls` files via `pd.read_excel` (requires `openpyxl`).
|
|
114
|
+
Auto-detected from file extension; error message guides installation if missing.
|
|
115
|
+
|
|
116
|
+
### Changed
|
|
117
|
+
- **Vectorized `check_buddhist_era`** — uses `.str.extractall` instead of row-by-row
|
|
118
|
+
loop; falls back to original method on error. Counts rows (not matches) to prevent
|
|
119
|
+
percentage > 100%.
|
|
120
|
+
- Roadmap shifted: v0.9 = LLM Q&A, v1.0 = interactive dashboard (was v0.8/v0.9).
|
|
121
|
+
- Architecture section updated to reflect v0.8 modules.
|
|
122
|
+
|
|
123
|
+
### Tests
|
|
124
|
+
- 35 new tests in `test_v08.py` covering all new functions (happy path + edge cases).
|
|
125
|
+
- Total: 356 passed, 1 skipped, ruff clean.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## [0.7.0] - 2026-06-25
|
|
130
|
+
|
|
131
|
+
Headline feature: **insight visualization** — every cross-column insight card now
|
|
132
|
+
includes an auto-generated chart matched to its pattern, embedded in the HTML report.
|
|
133
|
+
|
|
134
|
+
### Added
|
|
135
|
+
- **`thaieda.viz` insight chart functions**:
|
|
136
|
+
- `create_insight_outstanding_chart(segments, top_segment, title, font_path)` —
|
|
137
|
+
horizontal bar chart; top segment highlighted in green, others in muted gray.
|
|
138
|
+
- `create_insight_attribution_chart(segments, top_segment, share, title, font_path)` —
|
|
139
|
+
donut chart with the dominant segment's share % in the center.
|
|
140
|
+
- `create_insight_comparison_chart(df, breakdown, measure, top_segment, title, font_path)` —
|
|
141
|
+
box plot by group; top segment highlighted in red, groups beyond top-9 collapsed into "อื่น ๆ".
|
|
142
|
+
- `create_insight_trend_chart(segments, direction, tau, title, font_path)` —
|
|
143
|
+
line chart with filled area, direction arrow annotation, and τ value in the title.
|
|
144
|
+
- `create_insight_chart(card_dict, df, font_path)` — dispatcher that picks the right
|
|
145
|
+
chart based on the card's `pattern` field.
|
|
146
|
+
- **`ProfileReport._build_insight_charts()`** — generates a chart for each insight card
|
|
147
|
+
during `profile()` and embeds it in the HTML report alongside the text + evidence table.
|
|
148
|
+
- **HTML template** — insight cards now render an `<img>` (base64 PNG) when a chart is
|
|
149
|
+
available; evidence table is hidden for trend cards (the line chart replaces it).
|
|
150
|
+
|
|
151
|
+
### Changed
|
|
152
|
+
- Roadmap shifted: v0.8 = LLM Q&A, v0.9 = interactive dashboard (was v0.7/v0.8).
|
|
153
|
+
|
|
154
|
+
### Fixed
|
|
155
|
+
- Trend chart annotation: replaced Unicode arrows (↗↘) with ASCII text ("Trend UP"/"Trend DOWN")
|
|
156
|
+
to avoid missing-glyph warnings on Thai fonts.
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## [0.6.0] - 2026-06-25
|
|
161
|
+
|
|
162
|
+
Headline feature: **cross-column insight engine** — a *discoverer* (vs. the existing
|
|
163
|
+
`insight` *interpreter*) that combines columns (group-by + aggregate + statistical scoring)
|
|
164
|
+
to surface non-obvious business findings in Thai, ranked by an interestingness pipeline.
|
|
165
|
+
Works on **any** dataset — zero column-name logic, no domain overfitting; everything is
|
|
166
|
+
driven by `ColumnType` + cardinality + value ranges.
|
|
167
|
+
|
|
168
|
+
### Added
|
|
169
|
+
- **`thaieda.insight_engine` module** — discovers insights from column combinations:
|
|
170
|
+
- `discover_insights(df, column_types, *, top_n=8, sample_size=100_000, min_segment=30, progress=)`
|
|
171
|
+
— builds perspectives (breakdown × measure × agg) and detects **4 patterns**:
|
|
172
|
+
- **outstanding** — one segment dominates (`top/second ≥ 1.5`),
|
|
173
|
+
- **attribution** — one segment is a large share (`≥ 50%`) of a total (`≥ 3` segments),
|
|
174
|
+
- **comparison** — top segment differs significantly from the rest (ANOVA/Kruskal + JSD),
|
|
175
|
+
- **trend** — monotonic movement over an ordered (datetime-bucketed) axis (Mann-Kendall).
|
|
176
|
+
- Dataclasses `Perspective`, `InsightCard`, `InsightEngineResult` (all with `to_dict()`).
|
|
177
|
+
- **Interestingness pipeline**: `gate → score → penalize → rank` —
|
|
178
|
+
`final = gate × (0.5·pattern_score + 0.5·effect_size) × novelty × (1 − triviality)`.
|
|
179
|
+
- **Benjamini-Hochberg correction** across all candidate significance tests (FDR control
|
|
180
|
+
for hundreds of comparisons). Mann-Kendall p-values are computed with `math.erf`, so trend
|
|
181
|
+
significance works even without scipy; ANOVA p-values degrade to effect-size-only + a note.
|
|
182
|
+
- **Two-phase**: scores candidates on a stratified sample (~100k rows), then recomputes exact
|
|
183
|
+
numbers on the full data for the top-N only — handles 1M+ rows (804k rows in ~5s).
|
|
184
|
+
- **Triviality / non-additive guards**: excludes ID / near-unique / single-group breakdowns;
|
|
185
|
+
skips `sum` for measures bounded in `[0,1]` (or `[0,100]` percentage-like floats).
|
|
186
|
+
- Category keys are normalized before group-by (Thai numerals → Arabic, zero-width stripped,
|
|
187
|
+
trailing `.0` removed) via the shared `schema._normalize_key_series` — prevents split groups.
|
|
188
|
+
- **`ProfileReport(insights_engine=True, insights_top=8)`** — new analysis stage (after target
|
|
189
|
+
analysis, before timeseries), wrapped in try/except → `notes`. Top 3 cards feed the existing
|
|
190
|
+
`InsightSummary` (new `business` category) so they appear in the executive summary.
|
|
191
|
+
- **Dedicated HTML report section** "ข้อค้นพบจากการวิเคราะห์คอลัมน์ผสม" (Cross-Column Insights)
|
|
192
|
+
with pattern badges, per-card evidence mini-tables (top segments, share, lift, p-value, τ).
|
|
193
|
+
- **CLI** `--no-insights` and `--insights-top N` flags on `profile` and `run` (on by default).
|
|
194
|
+
- New `thaieda` top-level exports: `discover_insights`, `InsightCard`, `InsightEngineResult`,
|
|
195
|
+
`Perspective`.
|
|
196
|
+
|
|
197
|
+
## [0.5.0] - 2026-06-25
|
|
198
|
+
|
|
199
|
+
Headline feature: **multi-file schema discovery** — analyze a whole folder of related
|
|
200
|
+
files as one dataset, automatically infer how the tables connect, and render a combined
|
|
201
|
+
report with an interactive ER diagram.
|
|
202
|
+
|
|
203
|
+
### Added
|
|
204
|
+
- **`thaieda.schema` module** — discovers relationships between multiple data files:
|
|
205
|
+
- `discover_keys(df, table_name)` — finds primary/foreign key candidate columns
|
|
206
|
+
(name hints via `_name_hints_id` + uniqueness ≥ 95%, excluding boolean/constant columns).
|
|
207
|
+
- `match_relationships(tables, profiles, validate_values=, sample_size=)` — matches
|
|
208
|
+
columns across tables by normalized name, infers direction (unique side = parent/PK,
|
|
209
|
+
non-unique side = child/FK), and confirms with real value overlap. Both-non-unique
|
|
210
|
+
pairs are never linked (prevents `date ↔ date` false positives).
|
|
211
|
+
- `profile_dataset(paths, ...)` — reads a directory or list of files (`.csv/.json/.jsonl/.ndjson`),
|
|
212
|
+
profiles each table, and returns a `DatasetProfile`.
|
|
213
|
+
- Dataclasses `KeyCandidate`, `Relationship`, `TableProfile`, `DatasetProfile`
|
|
214
|
+
(all with `to_dict()`); `DatasetProfile.to_mermaid()` / `to_json()`.
|
|
215
|
+
- Key values are normalized before comparison (Thai numerals → Arabic, zero-width
|
|
216
|
+
characters stripped, trailing `.0` float artifacts removed) — vectorized for million-row tables.
|
|
217
|
+
- Orphan detection: foreign-key values with no matching primary key are reported in Thai.
|
|
218
|
+
- **`DatasetReport`** (`thaieda.report._dataset`) — self-contained HTML report with a schema
|
|
219
|
+
overview, a Mermaid.js ER diagram (loaded via CDN), per-table summaries with key columns,
|
|
220
|
+
a relationships table (overlap %, orphans, cardinality, confidence), and orphan findings.
|
|
221
|
+
- **CLI `thaieda dataset <dir|files...>`** — multi-file analysis to an HTML report
|
|
222
|
+
(`--no-validate`, `--json`, `--lang`, `--quiet`). `thaieda run`/`thaieda profile` now
|
|
223
|
+
auto-route to dataset mode when given a directory containing ≥ 2 supported files.
|
|
224
|
+
- New `thaieda` top-level exports: `profile_dataset`, `DatasetProfile`, `Relationship`,
|
|
225
|
+
`KeyCandidate`, `TableProfile`, `DatasetReport`.
|
|
226
|
+
|
|
227
|
+
## [0.4.1] - 2026-06-25
|
|
228
|
+
|
|
229
|
+
Performance fixes from real-world testing on large datasets (1M+ rows), plus UX/visual polish.
|
|
230
|
+
|
|
231
|
+
### Performance
|
|
232
|
+
- **Vectorized text cleaning** (`clean._apply_str_transform`): replaced the per-row Python
|
|
233
|
+
loop with vectorized pandas operations (`.str` accessors / bulk assignment). A single
|
|
234
|
+
cleaning operation on 1M rows now runs in ~1s (was 100s+). Encoding repair (ftfy) now
|
|
235
|
+
pre-filters rows so it only runs on likely-mojibake cells (~14.5s → ~0.5s on 1M rows).
|
|
236
|
+
- **Sampled ML anomaly detection**: Isolation Forest / LOF now sample down to 10,000 rows
|
|
237
|
+
on large columns (statistical z-score/MAD/IQR still run on full data). LOF on 1.3M rows
|
|
238
|
+
went from >180s to <1s. The result notes when sampling was used.
|
|
239
|
+
- **LOF on duplicate-heavy data**: skips LOF when >50% of values are duplicates, bumps
|
|
240
|
+
`n_neighbors` otherwise, and suppresses the noisy sklearn duplicate-distance warning.
|
|
241
|
+
|
|
242
|
+
### Fixed
|
|
243
|
+
- Cleaning suggestions no longer fail on numeric/ID columns (e.g. `int64` `customer_id`) —
|
|
244
|
+
suggestions now run only on text columns (THAI_TEXT / MIXED_TEXT / ENGLISH_TEXT /
|
|
245
|
+
CATEGORICAL), and the cleaning machinery returns object dtype safely.
|
|
246
|
+
|
|
247
|
+
### Added
|
|
248
|
+
- CLI `--sample N` flag (`run`, `profile`) — randomly sample N rows before analysis.
|
|
249
|
+
- CLI `--quiet` flag — minimal output (just result file paths).
|
|
250
|
+
- Progress feedback during processing (`อ่านไฟล์...`, `ตรวจจับประเภทคอลัมน์...`, …),
|
|
251
|
+
row count after reading large (>10MB) files, and a "large file" speed hint.
|
|
252
|
+
- `ProfileReport(progress=...)` callback hook for step-by-step progress.
|
|
253
|
+
- HTML report redesign: sticky section navigation, severity emoji + colored borders on
|
|
254
|
+
insight cards, type-colored column badges, a timeseries trend/seasonality banner, a
|
|
255
|
+
visual before→after cleaning diff (red strikethrough → green), responsive/mobile layout,
|
|
256
|
+
and print-friendly CSS.
|
|
257
|
+
- Better error messages: encoding failures list the encodings tried; cleaning failures
|
|
258
|
+
include the column dtype.
|
|
259
|
+
|
|
260
|
+
## [Unreleased]
|
|
261
|
+
|
|
262
|
+
### Added
|
|
263
|
+
- Initial project structure
|
|
264
|
+
- Thai text column detection (script-ratio based)
|
|
265
|
+
- Tier-1 data quality checks (Buddhist era, Thai numerals, zero-width spaces, script composition)
|
|
266
|
+
- Thai text metrics (length in chars/tokens/words, top tokens, n-grams)
|
|
267
|
+
- Word cloud with bundled Thai font
|
|
268
|
+
- HTML report generation (Jinja2)
|
|
269
|
+
- CLI interface (`thaieda profile data.csv`)
|
|
270
|
+
- Bilingual UI labels (Thai/English)
|
|
271
|
+
- Tokenizer adapter interface (pythainlp / nlpo3 / attacut)
|
|
272
|
+
|
|
273
|
+
## [0.1.0] - Unreleased
|
|
274
|
+
|
|
275
|
+
Initial alpha release.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
|
2
|
+
|
|
3
|
+
## Our Pledge
|
|
4
|
+
|
|
5
|
+
We as members, contributors, and leaders pledge to make participation in our
|
|
6
|
+
community a harassment-free experience for everyone, regardless of age, body
|
|
7
|
+
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
|
8
|
+
identity and expression, level of experience, education, socio-economic status,
|
|
9
|
+
nationality, personal appearance, race, religion, or sexual identity
|
|
10
|
+
and orientation.
|
|
11
|
+
|
|
12
|
+
We pledge to act and interact in ways that contribute to an open, welcoming,
|
|
13
|
+
diverse, inclusive, and healthy community.
|
|
14
|
+
|
|
15
|
+
## Our Standards
|
|
16
|
+
|
|
17
|
+
Examples of behavior that contributes to a positive environment:
|
|
18
|
+
|
|
19
|
+
* Demonstrating empathy and kindness toward other people
|
|
20
|
+
* Being respectful of differing opinions, viewpoints, and experiences
|
|
21
|
+
* Giving and gracefully accepting constructive feedback
|
|
22
|
+
* Accepting responsibility and apologizing to those affected by our mistakes
|
|
23
|
+
* Focusing on what is best for the overall community
|
|
24
|
+
|
|
25
|
+
Examples of unacceptable behavior:
|
|
26
|
+
|
|
27
|
+
* The use of sexualized language or imagery, and sexual attention or advances
|
|
28
|
+
* Trolling, insulting or derogatory comments, and personal or political attacks
|
|
29
|
+
* Public or private harassment
|
|
30
|
+
* Publishing others' private information without explicit permission
|
|
31
|
+
* Other conduct which could reasonably be considered inappropriate
|
|
32
|
+
|
|
33
|
+
## Enforcement Responsibilities
|
|
34
|
+
|
|
35
|
+
Community leaders are responsible for clarifying and enforcing our standards
|
|
36
|
+
and will take appropriate and fair corrective action in response to any
|
|
37
|
+
behavior that they deem inappropriate, threatening, offensive, or harmful.
|
|
38
|
+
|
|
39
|
+
## Scope
|
|
40
|
+
|
|
41
|
+
This Code of Conduct applies within all community spaces, and also applies
|
|
42
|
+
when an individual is officially representing the community in public spaces.
|
|
43
|
+
|
|
44
|
+
## Enforcement
|
|
45
|
+
|
|
46
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
|
47
|
+
reported to the community leaders responsible for enforcement at
|
|
48
|
+
**[open a private issue]**.
|
|
49
|
+
All complaints will be reviewed and investigated promptly and fairly.
|
|
50
|
+
|
|
51
|
+
## Attribution
|
|
52
|
+
|
|
53
|
+
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org),
|
|
54
|
+
version 2.1, available at
|
|
55
|
+
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Contributing to ThaiEDA
|
|
2
|
+
|
|
3
|
+
ขอบคุณที่สนใจร่วมพัฒนา! 🙏
|
|
4
|
+
|
|
5
|
+
## การร่วมพัฒนา
|
|
6
|
+
|
|
7
|
+
1. **Fork** repo และสร้าง branch ใหม่
|
|
8
|
+
```bash
|
|
9
|
+
git checkout -b feat/your-feature-name
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
2. **ติดตั้งสำหรับ development**
|
|
13
|
+
```bash
|
|
14
|
+
pip install -e ".[dev,thai,viz]"
|
|
15
|
+
pre-commit install
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
3. **เขียน tests** สำหรับ code ใหม่ทุกครั้ง
|
|
19
|
+
```bash
|
|
20
|
+
pytest tests/ -v
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
4. **Code style** — ใช้ ruff ตรวจสอบ
|
|
24
|
+
```bash
|
|
25
|
+
ruff check src/ tests/
|
|
26
|
+
ruff format src/ tests/
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
5. **สร้าง Pull Request** พร้อมคำอธิบายชัดเจน
|
|
30
|
+
|
|
31
|
+
## Code conventions
|
|
32
|
+
|
|
33
|
+
- Python 3.10+
|
|
34
|
+
- Type hints ทุก public function
|
|
35
|
+
- Docstrings เป็นภาษาไทยหรืออังกฤษก็ได้ (แต่ต้องชัดเจน)
|
|
36
|
+
- Thai text processing: ระบุ engine ที่ใช้เสมอ (transparency)
|
|
37
|
+
- ไม่ทำ silent fallback — ถ้า tokenizer ไม่พร้อม ให้ fail พร้อมข้อความชัดเจน
|
|
38
|
+
|
|
39
|
+
## การรายงาน bug
|
|
40
|
+
|
|
41
|
+
เปิด issue พร้อม:
|
|
42
|
+
- ข้อมูลตัวอย่าง (ถ้าเป็นไปได้)
|
|
43
|
+
- โค้ดที่ทำให้เกิดปัญหา
|
|
44
|
+
- ผลที่ได้ vs ผลที่คาดหวัง
|
|
45
|
+
- เวอร์ชัน Python และ ThaiEDA
|
|
46
|
+
|
|
47
|
+
## Code of Conduct
|
|
48
|
+
|
|
49
|
+
โปรดอ่าน [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) — ทุกคนต้องปฏิบัติตาม
|