thaieda 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. thaieda-1.0.0/.github/ISSUE_TEMPLATE/bug_report.md +31 -0
  2. thaieda-1.0.0/.github/ISSUE_TEMPLATE/feature_request.md +19 -0
  3. thaieda-1.0.0/.github/workflows/ci.yml +41 -0
  4. thaieda-1.0.0/.gitignore +60 -0
  5. thaieda-1.0.0/AGENTS.md +55 -0
  6. thaieda-1.0.0/CHANGELOG.md +275 -0
  7. thaieda-1.0.0/CODE_OF_CONDUCT.md +55 -0
  8. thaieda-1.0.0/CONTRIBUTING.md +49 -0
  9. thaieda-1.0.0/LICENSE +201 -0
  10. thaieda-1.0.0/PKG-INFO +270 -0
  11. thaieda-1.0.0/README.md +193 -0
  12. thaieda-1.0.0/assets/fonts/README.md +13 -0
  13. thaieda-1.0.0/assets/fonts/Sarabun-OFL.txt +93 -0
  14. thaieda-1.0.0/assets/fonts/Sarabun-Regular.ttf +0 -0
  15. thaieda-1.0.0/eval/README.md +130 -0
  16. thaieda-1.0.0/eval/fixtures/build_fixtures.py +243 -0
  17. thaieda-1.0.0/eval/fixtures/clean-thai.csv +101 -0
  18. thaieda-1.0.0/eval/fixtures/coffee-chain/CUSTOMER.csv +6001 -0
  19. thaieda-1.0.0/eval/fixtures/coffee-chain/DATE_DIM.csv +732 -0
  20. thaieda-1.0.0/eval/fixtures/coffee-chain/INVENTORY.csv +10001 -0
  21. thaieda-1.0.0/eval/fixtures/coffee-chain/LOCAL_EVENT.csv +1441 -0
  22. thaieda-1.0.0/eval/fixtures/coffee-chain/ORDER.csv +10001 -0
  23. thaieda-1.0.0/eval/fixtures/coffee-chain/PRODUCT.csv +61 -0
  24. thaieda-1.0.0/eval/fixtures/coffee-chain/PROMOTION.csv +8001 -0
  25. thaieda-1.0.0/eval/fixtures/coffee-chain/STORE.csv +21 -0
  26. thaieda-1.0.0/eval/fixtures/coffee-chain/TRANSACTION.csv +20978 -0
  27. thaieda-1.0.0/eval/fixtures/dirty-thai-labeled.csv +81 -0
  28. thaieda-1.0.0/eval/fixtures/superstore.csv +10801 -0
  29. thaieda-1.0.0/eval/manifests/coffee-chain-schema.expected.json +25 -0
  30. thaieda-1.0.0/eval/manifests/dirty-thai-labeled.expected.json +12 -0
  31. thaieda-1.0.0/eval/results/REPORT.md +58 -0
  32. thaieda-1.0.0/eval/results/results.json +60 -0
  33. thaieda-1.0.0/eval/run_eval.py +253 -0
  34. thaieda-1.0.0/eval/scenarios/__init__.py +1 -0
  35. thaieda-1.0.0/eval/scenarios/s1_thai_quality.py +115 -0
  36. thaieda-1.0.0/eval/scenarios/s2_relationships.py +79 -0
  37. thaieda-1.0.0/eval/scenarios/s3_insight_honesty.py +106 -0
  38. thaieda-1.0.0/pyproject.toml +126 -0
  39. thaieda-1.0.0/research/README.md +8 -0
  40. thaieda-1.0.0/research/eda-anomaly-research.md +194 -0
  41. thaieda-1.0.0/research/llm-privacy-design.md +776 -0
  42. thaieda-1.0.0/research/viz-cleaning-research.md +49 -0
  43. thaieda-1.0.0/src/thaieda/__init__.py +295 -0
  44. thaieda-1.0.0/src/thaieda/analysis/__init__.py +266 -0
  45. thaieda-1.0.0/src/thaieda/anomaly/__init__.py +1369 -0
  46. thaieda-1.0.0/src/thaieda/clean/__init__.py +948 -0
  47. thaieda-1.0.0/src/thaieda/cli.py +812 -0
  48. thaieda-1.0.0/src/thaieda/detect/__init__.py +357 -0
  49. thaieda-1.0.0/src/thaieda/i18n/__init__.py +268 -0
  50. thaieda-1.0.0/src/thaieda/insight/__init__.py +694 -0
  51. thaieda-1.0.0/src/thaieda/insight_engine/__init__.py +1248 -0
  52. thaieda-1.0.0/src/thaieda/io/__init__.py +235 -0
  53. thaieda-1.0.0/src/thaieda/llm/__init__.py +128 -0
  54. thaieda-1.0.0/src/thaieda/llm/_anonymize.py +212 -0
  55. thaieda-1.0.0/src/thaieda/llm/_prepare.py +279 -0
  56. thaieda-1.0.0/src/thaieda/llm/_prompt.py +248 -0
  57. thaieda-1.0.0/src/thaieda/llm/_provider.py +250 -0
  58. thaieda-1.0.0/src/thaieda/ner/__init__.py +260 -0
  59. thaieda-1.0.0/src/thaieda/quality/__init__.py +636 -0
  60. thaieda-1.0.0/src/thaieda/report/__init__.py +921 -0
  61. thaieda-1.0.0/src/thaieda/report/_dataset.py +143 -0
  62. thaieda-1.0.0/src/thaieda/report/_dataset_template.py +214 -0
  63. thaieda-1.0.0/src/thaieda/report/_template.py +583 -0
  64. thaieda-1.0.0/src/thaieda/schema/__init__.py +656 -0
  65. thaieda-1.0.0/src/thaieda/text/__init__.py +295 -0
  66. thaieda-1.0.0/src/thaieda/timeseries/__init__.py +620 -0
  67. thaieda-1.0.0/src/thaieda/tokenize/__init__.py +150 -0
  68. thaieda-1.0.0/src/thaieda/viz/__init__.py +1306 -0
  69. thaieda-1.0.0/tests/test_analysis.py +124 -0
  70. thaieda-1.0.0/tests/test_anomaly.py +391 -0
  71. thaieda-1.0.0/tests/test_clean.py +346 -0
  72. thaieda-1.0.0/tests/test_cli.py +183 -0
  73. thaieda-1.0.0/tests/test_detect.py +209 -0
  74. thaieda-1.0.0/tests/test_insight.py +295 -0
  75. thaieda-1.0.0/tests/test_insight_engine.py +377 -0
  76. thaieda-1.0.0/tests/test_io.py +141 -0
  77. thaieda-1.0.0/tests/test_llm.py +663 -0
  78. thaieda-1.0.0/tests/test_ner.py +150 -0
  79. thaieda-1.0.0/tests/test_oneliner.py +361 -0
  80. thaieda-1.0.0/tests/test_quality.py +166 -0
  81. thaieda-1.0.0/tests/test_report.py +168 -0
  82. thaieda-1.0.0/tests/test_schema.py +378 -0
  83. thaieda-1.0.0/tests/test_text.py +86 -0
  84. thaieda-1.0.0/tests/test_timeseries.py +221 -0
  85. thaieda-1.0.0/tests/test_tokenize.py +68 -0
  86. thaieda-1.0.0/tests/test_v08.py +277 -0
  87. thaieda-1.0.0/tests/test_viz.py +249 -0
@@ -0,0 +1,31 @@
1
+ ---
2
+ name: Bug report
3
+ about: รายงานปัญหาการใช้งาน
4
+ title: "[BUG] "
5
+ labels: bug
6
+ assignees: ''
7
+ ---
8
+
9
+ ## อธิบายปัญหา
10
+ คำอธิบายสั้น ๆ ว่าเกิดอะไรขึ้น
11
+
12
+ ## ข้อมูลสำหรับทำซ้ำ
13
+ ขั้นตอนที่ทำให้เกิดปัญหา:
14
+ 1. ...
15
+ 2. ...
16
+
17
+ ```python
18
+ # โค้ดตัวอย่าง
19
+ ```
20
+
21
+ ## ผลที่ได้
22
+ ...
23
+
24
+ ## ผลที่คาดหวัง
25
+ ...
26
+
27
+ ## สภาพแวดล้อม
28
+ - OS: [เช่น Windows 11, Ubuntu 22.04]
29
+ - Python version:
30
+ - ThaiEDA version:
31
+ - Tokenizer engine: [pythainlp / nlpo3 / attacut / none]
@@ -0,0 +1,19 @@
1
+ ---
2
+ name: Feature request
3
+ about: เสนอ feature ใหม่
4
+ title: "[FEAT] "
5
+ labels: enhancement
6
+ assignees: ''
7
+ ---
8
+
9
+ ## Feature ที่อยากเสนอ
10
+ ...
11
+
12
+ ## ทำไมถึงมีประโยชน์
13
+ ...
14
+
15
+ ## ทางเลือกที่พอมี
16
+ ...
17
+
18
+ ## ข้อมูลเพิ่มเติม
19
+ ...
@@ -0,0 +1,41 @@
1
+ name: CI
2
+
3
+ on:
4
+   push:
5
+     branches: [main, develop]
6
+   pull_request:
7
+     branches: [main]
8
+
9
+ jobs:
10
+   test:
11
+     runs-on: ${{ matrix.os }}
12
+     strategy:
13
+       fail-fast: false
14
+       matrix:
15
+         os: [ubuntu-latest, windows-latest]
16
+         python-version: ["3.10", "3.11", "3.12", "3.13"]
17
+
18
+     steps:
19
+       - uses: actions/checkout@v4
20
+
21
+       - name: Set up Python ${{ matrix.python-version }}
22
+         uses: actions/setup-python@v5
23
+         with:
24
+           python-version: ${{ matrix.python-version }}
25
+
26
+       - name: Install dependencies
27
+         run: |
28
+           python -m pip install --upgrade pip
29
+           pip install -e ".[dev,thai,viz]"
30
+
31
+       - name: Lint with ruff
32
+         run: |
33
+           ruff check src/ tests/
34
+           ruff format --check src/ tests/
35
+
36
+       - name: Type check with mypy
37
+         run: mypy src/thaieda --ignore-missing-imports
38
+         continue-on-error: true
39
+
40
+       - name: Test with pytest
41
+         run: pytest tests/ -v --tb=short
@@ -0,0 +1,60 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ .eggs/
11
+ wheels/
12
+
13
+ # Virtual environments
14
+ .venv/
15
+ venv/
16
+ env/
17
+ ENV/
18
+
19
+ # IDE
20
+ .idea/
21
+ .vscode/
22
+ *.swp
23
+ *.swo
24
+ *~
25
+ .DS_Store
26
+
27
+ # Testing
28
+ .pytest_cache/
29
+ .coverage
30
+ htmlcov/
31
+ .tox/
32
+ .mypy_cache/
33
+ .ruff_cache/
34
+
35
+ # Jupyter
36
+ .ipynb_checkpoints/
37
+ *.ipynb_checkpoints
38
+
39
+ # OS
40
+ Thumbs.db
41
+ desktop.ini
42
+
43
+ # Secrets — ห้าม commit เด็ดขาด
44
+ .env
45
+ .env.*
46
+ *.pem
47
+ *.key
48
+ secret*
49
+ credentials*
50
+
51
+ # Generated reports (output จากการรัน)
52
+ *.thaieda.html
53
+ *.cleaned.csv
54
+ reports_output/
55
+
56
+ # Test/example data — ห้าม commit (ไฟล์ใหญ่ ใช้ทดสอบเท่านั้น)
57
+ data-example/
58
+
59
+ # Git worktrees ภายใน (สำหรับงาน agent)
60
+ _worktrees/
@@ -0,0 +1,55 @@
1
+ # AGENTS.md — ThaiEDA
2
+
3
+ ## Project
4
+ ThaiEDA คือ open-source AutoEDA library สำหรับข้อมูลภาษาไทย
5
+ - Repo: https://github.com/peetwan/thaieda
6
+ - License: Apache-2.0
7
+ - Python 3.10+, pandas + matplotlib + Jinja2 + pythainlp (optional)
8
+ - Current version: v0.8 (commit 3351e9c)
9
+ - Tests: 356 passed, 1 skipped, ruff clean
10
+
11
+ ## Structure
12
+ ```
13
+ src/thaieda/
14
+ detect/ column type detection + Thai month name detection (v0.8)
15
+ tokenize/ tokenizer adapter (pythainlp/nlpo3/attacut)
16
+ text/ text metrics
17
+ quality/ Thai data quality checks + placeholder/constant detection (v0.8) + vectorized BE
18
+ anomaly/ anomaly detection (numeric + text + Thai-specific)
19
+ clean/ data cleaning: encoding, zwspace, numerals, BE→CE, dates, duplicates, missing (v0.8)
20
+ ner/ Thai NER (v0.2)
21
+ analysis/ target variable analysis (v0.2)
22
+ insight/ auto insight summary — interpreter (v0.3) + distribution/correlation (v0.4)
23
+ insight_engine/ cross-column insight discovery — 6 patterns + BH correction (v0.6+v0.8)
24
+ timeseries/ timeseries analysis (v0.4)
25
+ schema/ multi-file schema discovery — PK/FK + relationship matching (v0.5)
26
+ viz/ visualization + Thai font + insight charts (v0.7)
27
+ report/ HTML report (Jinja2) + DatasetReport (v0.5)
28
+ i18n/ TH/EN labels
29
+ llm/ placeholder (v0.9+)
30
+ tests/ pytest (356 tests)
31
+ research/ research notes (cron-generated, NOT source code)
32
+ ```
33
+
34
+ ## v0.8 additions
35
+ - clean/: coerce_numeric_column, convert_buddhist_era, normalize_dates, remove_duplicate_rows, handle_missing_values
36
+ - insight_engine/: _detect_strong_correlations, _detect_outlier_insights, cross-pattern novelty, adaptive min_segment, all_buckets in trend evidence
37
+ - quality/: check_placeholder_values, check_constant_column, vectorized check_buddhist_era
38
+ - detect/: Thai month name detection in _looks_like_datetime
39
+ - io/: Excel (.xlsx/.xls) support via openpyxl
40
+
41
+ ## Cron job rules
42
+ - Cron jobs แก้ไขเฉพาะ `research/` directory เท่านั้น
43
+ - ห้ามแก้ src/, tests/, pyproject.toml, หรือ config อื่น ๆ
44
+ - ทุก finding ต้องมี source URL
45
+ - Git commit message: `docs: research update - <topic> (YYYY-MM-DD)`
46
+ - Push to origin main
47
+
48
+ ## Coding conventions
49
+ - Python 3.10+ with type hints
50
+ - Thai docstrings/comments
51
+ - Lazy imports for optional deps
52
+ - No silent fallbacks — fail loudly with helpful message
53
+ - matplotlib Agg backend (no GUI)
54
+ - Vectorized operations preferred (pandas .str accessors over row-by-row loops)
55
+ - Use contextlib.suppress instead of try/except/pass
@@ -0,0 +1,275 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.9.0] - 2026-06-25
9
+
10
+ Headline feature: **privacy-preserving LLM analysis** — feed your EDA results to
11
+ any LLM (OpenAI / Anthropic / Ollama) with selectable privacy modes so you
12
+ control exactly what data leaves your machine.
13
+
14
+ ### Added — LLM Analysis Module
15
+
16
+ Four **privacy modes** for `analyze_with_llm()` — ranked from safest to riskiest:
17
+
18
+ | Mode | What leaves the machine | Privacy guarantee | Use case |
19
+ |------|-------------------------|-------------------|----------|
20
+ | `insight_only` (default) | Only summary statistics + insight cards sent | Raw data never leaves the machine | Regulated/PDPA data, default for cautious users |
21
+ | `anonymized` | PII replaced with reversible tokens before sending | Names/phones/ID cards replaced with `[NAME_1]`, `[PHONE_1]` | Need LLM to see structure/patterns without raw PII |
22
+ | `dp_noise` | Stats with Laplace mechanism noise (ε configurable) | DP noise prevents re-identification from small stats | Small datasets where stats alone may leak identity |
23
+ | `full` | All raw data sent (user accepts risk) | None — user accepts tradeoff for max accuracy | Public/non-sensitive data; dev/demo workflows |
24
+
25
+ ### New Files (v0.9)
26
+
27
+ - `src/thaieda/llm/__init__.py` — public API: `analyze_with_llm(df, privacy, provider, model, language)`
28
+ - `src/thaieda/llm/_prepare.py` — `prepare_for_llm()` — prepares data per privacy mode (4 modes)
29
+ - `src/thaieda/llm/_anonymize.py` — `anonymize_dataframe()` — PII detection (phone, ID card, NER)
30
+ - `src/thaieda/llm/_prompt.py` — `build_prompt()` — Thai/English prompt builder
31
+ - `src/thaieda/llm/_provider.py` — `call_llm()` — lazy import OpenAI/Anthropic/Ollama
32
+ - `tests/test_llm.py` — 59 tests covering all modes + protocol contracts
33
+
34
+ ### Added — Three LLM Providers (all optional / lazy)
35
+
36
+ | Provider | Package | Default model | API key env var |
37
+ |----------|---------|---------------|-----------------|
38
+ | `openai` (default) | `pip install openai` | `gpt-4o-mini` | `OPENAI_API_KEY` |
39
+ | `anthropic` | `pip install anthropic` | `claude-3-5-sonnet-20241022` | `ANTHROPIC_API_KEY` |
40
+ | `ollama` | `pip install ollama` *(or built-in HTTP fallback)* | `llama3.1` | `OLLAMA_HOST` (default `http://localhost:11434`) |
41
+
42
+ **All LLM dependencies are optional and lazy-imported** — the library imports
43
+ fine without `openai`/`anthropic`/`ollama` installed. The provider raises a
44
+ helpful `ImportError`/`RuntimeError` only when actually called.
45
+
46
+ ### Privacy Guarantees Per Mode
47
+
48
+ - **`insight_only`** — the raw DataFrame is never sent; only the `summary` and insight
49
+ cards computed locally are sent. The LLM sees aggregated stats, not rows. **Nothing
50
+ user-identifiable leaves the machine** beyond aggregate statistics.
51
+ - **`anonymized`** — phone numbers (all Thai formats: `081-234-5678`,
52
+ `0812345678`, `+66812345678`), ID cards (`X-XXXX-XXXXX-XX-X`), and named
53
+ entities (PERSON/LOCATION/ORGANIZATION via NER) are replaced with consistent
54
+ tokens. The `token_map` is returned so users can reverse-lookup tokens to
55
+ originals locally. Numeric columns are untouched.
56
+ - **`dp_noise`** — Laplace mechanism noise added to numeric stats (mean, min,
57
+ max, count) and categorical counts. Configurable `epsilon` (smaller = more
58
+ noise = more privacy). `epsilon=0` raises `ValueError`.
59
+ - **`full`** — raw DataFrame copied and sent in prompt. No privacy guarantee.
60
+
61
+ ### Tests
62
+
63
+ - 23 new tests in `test_llm.py` covering real-world Thai government dataset,
64
+ token reversibility, prompt branding, full pipeline mock, dp_noise with fixed
65
+ seed, and ollama dispatch.
66
+ - Total LLM tests: 59 (36 original + 23 new), all passing.
67
+ - Ruff clean.
68
+
69
+ ---
70
+
71
+ ## [0.8.0] - 2026-06-25
72
+
73
+ Headline feature: **clean data + actionable insights** — 12 improvements addressing
74
+ real-world gaps found by code review (Claude Code) and testing on messy Thai data.
75
+
76
+ ### Added — Data Cleaning
77
+ - **`coerce_numeric_column()`** — converts string columns with Thai numerals to proper
78
+ numeric dtype (not NaN). Handles placeholder values (`-`, `N/A`, `ไม่มี`) → NaN first.
79
+ - **`convert_buddhist_era()`** — converts BE years (2440–2599) to CE in both numeric
80
+ columns (`2530` → `1987`) and date-string columns (`2567-01-15` → `2024-01-15`).
81
+ - **`normalize_dates()`** — standardizes Thai month name dates to numeric `DD/MM/YYYY` format
82
+ (`15 มกราคม 2567` → `15/01/2024`) and converts BE→CE in date strings.
83
+ - **`remove_duplicate_rows()`** — detects and removes fully duplicated rows from a DataFrame.
84
+ - **`handle_missing_values()`** — 5 strategies: `flag` (0/ไม่ระบุ), `drop`, `median`,
85
+ `mode`, `unknown`. Type-aware (numeric vs text).
86
+
87
+ ### Added — Insight Engine
88
+ - **Correlation pattern** — detects strong pairwise correlations (|r| ≥ 0.7) between
89
+ numeric columns and generates insight cards with r-value and direction.
90
+ - **Outlier pattern** — detects row-level statistical outliers (z-score ≥ 3) and
91
+ generates insight cards with outlier count, percentage, and max z-score.
92
+ - **Cross-pattern novelty penalty** — when multiple patterns point to the same
93
+ breakdown × measure × segment (e.g., an outlier causing outstanding + comparison +
94
+ attribution), only the highest-ranked keeps full score.
95
+ - **Adaptive `min_segment`** — auto-adjusts for small datasets (`max(5, total_n // 20)`)
96
+ with a note when triggered.
97
+ - **Trend evidence `all_buckets`** — trend findings now include the full bucket series
98
+ in evidence, fixing the v0.7 bug where trend charts showed only 2 data points.
99
+ - **`_recompute_full` for correlation/outlier** — correlation and outlier evidence is
100
+ recomputed on the full dataset (not just sample) during two-phase scoring.
101
+
102
+ ### Added — Quality Checks
103
+ - **`check_placeholder_values()`** — flags placeholder values (`-`, `N/A`, `NULL`,
104
+ `ไม่มี`, `ไม่ระบุ`) that should be NaN. Vectorized with `.isin()`.
105
+ - **`check_constant_column()`** — flags columns with zero variance (single unique value)
106
+ as useless for analysis.
107
+
108
+ ### Added — Detection
109
+ - **Thai month name detection** — `_looks_like_datetime` now recognizes Thai month names
110
+ (`15 มกราคม 2567`, `1 ก.พ. 67`) in addition to numeric date formats.
111
+
112
+ ### Added — I/O
113
+ - **Excel support** — `.xlsx` and `.xls` files via `pd.read_excel` (requires `openpyxl`).
114
+ Auto-detected from file extension; error message guides installation if missing.
115
+
116
+ ### Changed
117
+ - **Vectorized `check_buddhist_era`** — uses `.str.extractall` instead of row-by-row
118
+ loop; falls back to original method on error. Counts rows (not matches) to prevent
119
+ percentage > 100%.
120
+ - Roadmap shifted: v0.9 = LLM Q&A, v1.0 = interactive dashboard (was v0.8/v0.9).
121
+ - Architecture section updated to reflect v0.8 modules.
122
+
123
+ ### Tests
124
+ - 35 new tests in `test_v08.py` covering all new functions (happy path + edge cases).
125
+ - Total: 356 passed, 1 skipped, ruff clean.
126
+
127
+ ---
128
+
129
+ ## [0.7.0] - 2026-06-25
130
+
131
+ Headline feature: **insight visualization** — every cross-column insight card now
132
+ includes an auto-generated chart matched to its pattern, embedded in the HTML report.
133
+
134
+ ### Added
135
+ - **`thaieda.viz` insight chart functions**:
136
+ - `create_insight_outstanding_chart(segments, top_segment, title, font_path)` —
137
+ horizontal bar chart; top segment highlighted in green, others in muted gray.
138
+ - `create_insight_attribution_chart(segments, top_segment, share, title, font_path)` —
139
+ donut chart with the dominant segment's share % in the center.
140
+ - `create_insight_comparison_chart(df, breakdown, measure, top_segment, title, font_path)` —
141
+ box plot by group; top segment highlighted in red, groups beyond top-9 collapsed into "อื่น ๆ".
142
+ - `create_insight_trend_chart(segments, direction, tau, title, font_path)` —
143
+ line chart with filled area, direction arrow annotation, and τ value in the title.
144
+ - `create_insight_chart(card_dict, df, font_path)` — dispatcher that picks the right
145
+ chart based on the card's `pattern` field.
146
+ - **`ProfileReport._build_insight_charts()`** — generates a chart for each insight card
147
+ during `profile()` and embeds it in the HTML report alongside the text + evidence table.
148
+ - **HTML template** — insight cards now render an `<img>` (base64 PNG) when a chart is
149
+ available; evidence table is hidden for trend cards (the line chart replaces it).
150
+
151
+ ### Changed
152
+ - Roadmap shifted: v0.8 = LLM Q&A, v0.9 = interactive dashboard (was v0.7/v0.8).
153
+
154
+ ### Fixed
155
+ - Trend chart annotation: replaced Unicode arrows (↗↘) with ASCII text ("Trend UP"/"Trend DOWN")
156
+ to avoid missing-glyph warnings on Thai fonts.
157
+
158
+ ---
159
+
160
+ ## [0.6.0] - 2026-06-25
161
+
162
+ Headline feature: **cross-column insight engine** — a *discoverer* (vs. the existing
163
+ `insight` *interpreter*) that combines columns (group-by + aggregate + statistical scoring)
164
+ to surface non-obvious business findings in Thai, ranked by an interestingness pipeline.
165
+ Works on **any** dataset — zero column-name logic, no domain overfitting; everything is
166
+ driven by `ColumnType` + cardinality + value ranges.
167
+
168
+ ### Added
169
+ - **`thaieda.insight_engine` module** — discovers insights from column combinations:
170
+ - `discover_insights(df, column_types, *, top_n=8, sample_size=100_000, min_segment=30, progress=...)`
171
+ — builds perspectives (breakdown × measure × agg) and detects **4 patterns**:
172
+ - **outstanding** — one segment dominates (`top/second ≥ 1.5`),
173
+ - **attribution** — one segment is a large share (`≥ 50%`) of a total (`≥ 3` segments),
174
+ - **comparison** — top segment differs significantly from the rest (ANOVA/Kruskal + JSD),
175
+ - **trend** — monotonic movement over an ordered (datetime-bucketed) axis (Mann-Kendall).
176
+ - Dataclasses `Perspective`, `InsightCard`, `InsightEngineResult` (all with `to_dict()`).
177
+ - **Interestingness pipeline**: `gate → score → penalize → rank` —
178
+ `final = gate × (0.5·pattern_score + 0.5·effect_size) × novelty × (1 − triviality)`.
179
+ - **Benjamini-Hochberg correction** across all candidate significance tests (FDR control
180
+ for hundreds of comparisons). Mann-Kendall p-values are computed with `math.erf`, so trend
181
+ significance works even without scipy; ANOVA p-values degrade to effect-size-only + a note.
182
+ - **Two-phase**: scores candidates on a stratified sample (~100k rows), then recomputes exact
183
+ numbers on the full data for the top-N only — handles 1M+ rows (804k rows in ~5s).
184
+ - **Triviality / non-additive guards**: excludes ID / near-unique / single-group breakdowns;
185
+ skips `sum` for measures bounded in `[0,1]` (or `[0,100]` percentage-like floats).
186
+ - Category keys are normalized before group-by (Thai numerals → Arabic, zero-width stripped,
187
+ trailing `.0` removed) via the shared `schema._normalize_key_series` — prevents split groups.
188
+ - **`ProfileReport(insights_engine=True, insights_top=8)`** — new analysis stage (after target
189
+ analysis, before timeseries), wrapped in try/except → `notes`. Top 3 cards feed the existing
190
+ `InsightSummary` (new `business` category) so they appear in the executive summary.
191
+ - **Dedicated HTML report section** "ข้อค้นพบจากการวิเคราะห์คอลัมน์ผสม" (Cross-Column Insights)
192
+ with pattern badges, per-card evidence mini-tables (top segments, share, lift, p-value, τ).
193
+ - **CLI** `--no-insights` and `--insights-top N` flags on `profile` and `run` (on by default).
194
+ - New `thaieda` top-level exports: `discover_insights`, `InsightCard`, `InsightEngineResult`,
195
+ `Perspective`.
196
+
197
+ ## [0.5.0] - 2026-06-25
198
+
199
+ Headline feature: **multi-file schema discovery** — analyze a whole folder of related
200
+ files as one dataset, automatically infer how the tables connect, and render a combined
201
+ report with an interactive ER diagram.
202
+
203
+ ### Added
204
+ - **`thaieda.schema` module** — discovers relationships between multiple data files:
205
+ - `discover_keys(df, table_name)` — finds primary/foreign key candidate columns
206
+ (name hints via `_name_hints_id` + uniqueness ≥ 95%, excluding boolean/constant columns).
207
+ - `match_relationships(tables, profiles, validate_values=..., sample_size=...)` — matches
208
+ columns across tables by normalized name, infers direction (unique side = parent/PK,
209
+ non-unique side = child/FK), and confirms with real value overlap. Both-non-unique
210
+ pairs are never linked (prevents `date ↔ date` false positives).
211
+ - `profile_dataset(paths, ...)` — reads a directory or list of files (`.csv/.json/.jsonl/.ndjson`),
212
+ profiles each table, and returns a `DatasetProfile`.
213
+ - Dataclasses `KeyCandidate`, `Relationship`, `TableProfile`, `DatasetProfile`
214
+ (all with `to_dict()`); `DatasetProfile.to_mermaid()` / `to_json()`.
215
+ - Key values are normalized before comparison (Thai numerals → Arabic, zero-width
216
+ characters stripped, trailing `.0` float artifacts removed) — vectorized for million-row tables.
217
+ - Orphan detection: foreign-key values with no matching primary key are reported in Thai.
218
+ - **`DatasetReport`** (`thaieda.report._dataset`) — self-contained HTML report with a schema
219
+ overview, a Mermaid.js ER diagram (loaded via CDN), per-table summaries with key columns,
220
+ a relationships table (overlap %, orphans, cardinality, confidence), and orphan findings.
221
+ - **CLI `thaieda dataset <dir|files...>`** — multi-file analysis to an HTML report
222
+ (`--no-validate`, `--json`, `--lang`, `--quiet`). `thaieda run`/`thaieda profile` now
223
+ auto-route to dataset mode when given a directory containing ≥ 2 supported files.
224
+ - New `thaieda` top-level exports: `profile_dataset`, `DatasetProfile`, `Relationship`,
225
+ `KeyCandidate`, `TableProfile`, `DatasetReport`.
226
+
227
+ ## [0.4.1] - 2026-06-25
228
+
229
+ Performance fixes from real-world testing on large datasets (1M+ rows), plus UX/visual polish.
230
+
231
+ ### Performance
232
+ - **Vectorized text cleaning** (`clean._apply_str_transform`): replaced the per-row Python
233
+ loop with vectorized pandas operations (`.str` accessors / bulk assignment). A single
234
+ cleaning operation on 1M rows now runs in ~1s (was 100s+). Encoding repair (ftfy) now
235
+ pre-filters rows so it only runs on likely-mojibake cells (~14.5s → ~0.5s on 1M rows).
236
+ - **Sampled ML anomaly detection**: Isolation Forest / LOF now sample down to 10,000 rows
237
+ on large columns (statistical z-score/MAD/IQR still run on full data). LOF on 1.3M rows
238
+ went from >180s to <1s. The result notes when sampling was used.
239
+ - **LOF on duplicate-heavy data**: skips LOF when >50% of values are duplicates, bumps
240
+ `n_neighbors` otherwise, and suppresses the noisy sklearn duplicate-distance warning.
241
+
242
+ ### Fixed
243
+ - Cleaning suggestions no longer fail on numeric/ID columns (e.g. `int64` `customer_id`) —
244
+ suggestions now run only on text columns (THAI_TEXT / MIXED_TEXT / ENGLISH_TEXT /
245
+ CATEGORICAL), and the cleaning machinery returns object dtype safely.
246
+
247
+ ### Added
248
+ - CLI `--sample N` flag (`run`, `profile`) — randomly sample N rows before analysis.
249
+ - CLI `--quiet` flag — minimal output (just result file paths).
250
+ - Progress feedback during processing (`อ่านไฟล์...`, `ตรวจจับประเภทคอลัมน์...`, …),
251
+ row count after reading large (>10MB) files, and a "large file" speed hint.
252
+ - `ProfileReport(progress=...)` callback hook for step-by-step progress.
253
+ - HTML report redesign: sticky section navigation, severity emoji + colored borders on
254
+ insight cards, type-colored column badges, a timeseries trend/seasonality banner, a
255
+ visual before→after cleaning diff (red strikethrough → green), responsive/mobile layout,
256
+ and print-friendly CSS.
257
+ - Better error messages: encoding failures list the encodings tried; cleaning failures
258
+ include the column dtype.
259
+
260
+ ## [Unreleased]
261
+
262
+ ### Added
263
+ - Initial project structure
264
+ - Thai text column detection (script-ratio based)
265
+ - Tier-1 data quality checks (Buddhist era, Thai numerals, zero-width spaces, script composition)
266
+ - Thai text metrics (length in chars/tokens/words, top tokens, n-grams)
267
+ - Word cloud with bundled Thai font
268
+ - HTML report generation (Jinja2)
269
+ - CLI interface (`thaieda profile data.csv`)
270
+ - Bilingual UI labels (Thai/English)
271
+ - Tokenizer adapter interface (pythainlp / nlpo3 / attacut)
272
+
273
+ ## [0.1.0] - Unreleased
274
+
275
+ Initial alpha release.
@@ -0,0 +1,55 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, religion, or sexual identity
10
+ and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment:
18
+
19
+ * Demonstrating empathy and kindness toward other people
20
+ * Being respectful of differing opinions, viewpoints, and experiences
21
+ * Giving and gracefully accepting constructive feedback
22
+ * Accepting responsibility and apologizing to those affected by our mistakes
23
+ * Focusing on what is best for the overall community
24
+
25
+ Examples of unacceptable behavior:
26
+
27
+ * The use of sexualized language or imagery, and sexual attention or advances
28
+ * Trolling, insulting or derogatory comments, and personal or political attacks
29
+ * Public or private harassment
30
+ * Publishing others' private information without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate
32
+
33
+ ## Enforcement Responsibilities
34
+
35
+ Community leaders are responsible for clarifying and enforcing our standards
36
+ and will take appropriate and fair corrective action in response to any
37
+ behavior that they deem inappropriate, threatening, offensive, or harmful.
38
+
39
+ ## Scope
40
+
41
+ This Code of Conduct applies within all community spaces, and also applies
42
+ when an individual is officially representing the community in public spaces.
43
+
44
+ ## Enforcement
45
+
46
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
47
+ reported to the community leaders responsible for enforcement by
48
+ **privately contacting the maintainers** via the project repository (https://github.com/peetwan/thaieda).
49
+ All complaints will be reviewed and investigated promptly and fairly.
50
+
51
+ ## Attribution
52
+
53
+ This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org),
54
+ version 2.1, available at
55
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
@@ -0,0 +1,49 @@
1
+ # Contributing to ThaiEDA
2
+
3
+ ขอบคุณที่สนใจร่วมพัฒนา! 🙏
4
+
5
+ ## การร่วมพัฒนา
6
+
7
+ 1. **Fork** repo และสร้าง branch ใหม่
8
+ ```bash
9
+ git checkout -b feat/your-feature-name
10
+ ```
11
+
12
+ 2. **ติดตั้งสำหรับ development**
13
+ ```bash
14
+ pip install -e ".[dev,thai,viz]"
15
+ pre-commit install
16
+ ```
17
+
18
+ 3. **เขียน tests** สำหรับ code ใหม่ทุกครั้ง
19
+ ```bash
20
+ pytest tests/ -v
21
+ ```
22
+
23
+ 4. **Code style** — ใช้ ruff ตรวจสอบ
24
+ ```bash
25
+ ruff check src/ tests/
26
+ ruff format src/ tests/
27
+ ```
28
+
29
+ 5. **สร้าง Pull Request** พร้อมคำอธิบายชัดเจน
30
+
31
+ ## Code conventions
32
+
33
+ - Python 3.10+
34
+ - Type hints ทุก public function
35
+ - Docstrings เป็นภาษาไทยหรืออังกฤษก็ได้ (แต่ต้องชัดเจน)
36
+ - Thai text processing: ระบุ engine ที่ใช้เสมอ (transparency)
37
+ - ไม่ทำ silent fallback — ถ้า tokenizer ไม่พร้อม ให้ fail พร้อมข้อความชัดเจน
38
+
39
+ ## การรายงาน bug
40
+
41
+ เปิด issue พร้อม:
42
+ - ข้อมูลตัวอย่าง (ถ้าเป็นไปได้)
43
+ - โค้ดที่ทำให้เกิดปัญหา
44
+ - ผลที่ได้ vs ผลที่คาดหวัง
45
+ - เวอร์ชัน Python และ ThaiEDA
46
+
47
+ ## Code of Conduct
48
+
49
+ โปรดอ่าน [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) — ทุกคนต้องปฏิบัติตาม