theme-extractor 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. theme_extractor-0.1.0/.gitignore +255 -0
  2. theme_extractor-0.1.0/LICENSE.md +21 -0
  3. theme_extractor-0.1.0/PKG-INFO +175 -0
  4. theme_extractor-0.1.0/README.md +127 -0
  5. theme_extractor-0.1.0/howto/README.md +13 -0
  6. theme_extractor-0.1.0/pyproject.toml +117 -0
  7. theme_extractor-0.1.0/src/theme_extractor/__init__.py +3 -0
  8. theme_extractor-0.1.0/src/theme_extractor/cli.py +1232 -0
  9. theme_extractor-0.1.0/src/theme_extractor/domain/__init__.py +59 -0
  10. theme_extractor-0.1.0/src/theme_extractor/domain/contracts.py +80 -0
  11. theme_extractor-0.1.0/src/theme_extractor/domain/enums.py +285 -0
  12. theme_extractor-0.1.0/src/theme_extractor/errors.py +33 -0
  13. theme_extractor-0.1.0/src/theme_extractor/evaluation/__init__.py +5 -0
  14. theme_extractor-0.1.0/src/theme_extractor/evaluation/metrics.py +270 -0
  15. theme_extractor-0.1.0/src/theme_extractor/extraction/__init__.py +39 -0
  16. theme_extractor-0.1.0/src/theme_extractor/extraction/baselines.py +400 -0
  17. theme_extractor-0.1.0/src/theme_extractor/extraction/bertopic.py +536 -0
  18. theme_extractor-0.1.0/src/theme_extractor/extraction/characterization.py +183 -0
  19. theme_extractor-0.1.0/src/theme_extractor/extraction/embedding_cache.py +187 -0
  20. theme_extractor-0.1.0/src/theme_extractor/extraction/keybert.py +282 -0
  21. theme_extractor-0.1.0/src/theme_extractor/extraction/llm.py +329 -0
  22. theme_extractor-0.1.0/src/theme_extractor/extraction/utils.py +36 -0
  23. theme_extractor-0.1.0/src/theme_extractor/ingestion/__init__.py +5 -0
  24. theme_extractor-0.1.0/src/theme_extractor/ingestion/cleaning.py +589 -0
  25. theme_extractor-0.1.0/src/theme_extractor/ingestion/extractors.py +429 -0
  26. theme_extractor-0.1.0/src/theme_extractor/ingestion/pipeline.py +597 -0
  27. theme_extractor-0.1.0/src/theme_extractor/reporting/__init__.py +5 -0
  28. theme_extractor-0.1.0/src/theme_extractor/reporting/markdown.py +281 -0
  29. theme_extractor-0.1.0/src/theme_extractor/resources/__init__.py +1 -0
  30. theme_extractor-0.1.0/src/theme_extractor/resources/stopword_column_names.json +7 -0
  31. theme_extractor-0.1.0/src/theme_extractor/resources/stopwords_fr_fallback.json +66 -0
  32. theme_extractor-0.1.0/src/theme_extractor/search/__init__.py +13 -0
  33. theme_extractor-0.1.0/src/theme_extractor/search/adapters.py +251 -0
  34. theme_extractor-0.1.0/src/theme_extractor/search/factory.py +76 -0
  35. theme_extractor-0.1.0/src/theme_extractor/search/protocols.py +69 -0
@@ -0,0 +1,255 @@
1
+ # File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
2
+ # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,dotenv,python
3
+ # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,macos,dotenv,python
4
+
5
+ ### dotenv ###
6
+ .env
7
+
8
+ ### macOS ###
9
+ # General
10
+ .DS_Store
11
+ .AppleDouble
12
+ .LSOverride
13
+
14
+ # Icon must end with two \r
15
+ Icon
16
+
17
+
18
+ # Thumbnails
19
+ ._*
20
+
21
+ # Files that might appear in the root of a volume
22
+ .DocumentRevisions-V100
23
+ .fseventsd
24
+ .Spotlight-V100
25
+ .TemporaryItems
26
+ .Trashes
27
+ .VolumeIcon.icns
28
+ .com.apple.timemachine.donotpresent
29
+
30
+ # Directories potentially created on remote AFP share
31
+ .AppleDB
32
+ .AppleDesktop
33
+ Network Trash Folder
34
+ Temporary Items
35
+ .apdisk
36
+
37
+ ### macOS Patch ###
38
+ # iCloud generated files
39
+ *.icloud
40
+
41
+ ### Python ###
42
+ # Byte-compiled / optimized / DLL files
43
+ __pycache__/
44
+ *.py[cod]
45
+ *$py.class
46
+
47
+ # C extensions
48
+ *.so
49
+
50
+ # Distribution / packaging
51
+ .Python
52
+ build/
53
+ develop-eggs/
54
+ dist/
55
+ downloads/
56
+ eggs/
57
+ .eggs/
58
+ lib/
59
+ lib64/
60
+ parts/
61
+ sdist/
62
+ var/
63
+ wheels/
64
+ share/python-wheels/
65
+ *.egg-info/
66
+ .installed.cfg
67
+ *.egg
68
+ MANIFEST
69
+
70
+ # PyInstaller
71
+ # Usually these files are written by a python script from a template
72
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
73
+ *.manifest
74
+ *.spec
75
+
76
+ # Installer logs
77
+ pip-log.txt
78
+ pip-delete-this-directory.txt
79
+
80
+ # Unit test / coverage reports
81
+ htmlcov/
82
+ .tox/
83
+ .nox/
84
+ .coverage
85
+ .coverage.*
86
+ .cache
87
+ nosetests.xml
88
+ coverage.xml
89
+ *.cover
90
+ *.py,cover
91
+ .hypothesis/
92
+ .pytest_cache/
93
+ cover/
94
+
95
+ # Translations
96
+ *.mo
97
+ *.pot
98
+
99
+ # Django stuff:
100
+ *.log
101
+ local_settings.py
102
+ db.sqlite3
103
+ db.sqlite3-journal
104
+
105
+ # Flask stuff:
106
+ instance/
107
+ .webassets-cache
108
+
109
+ # Scrapy stuff:
110
+ .scrapy
111
+
112
+ # Sphinx documentation
113
+ docs/_build/
114
+ docs/_generated/
115
+
116
+ # PyBuilder
117
+ .pybuilder/
118
+ target/
119
+
120
+ # Jupyter Notebook
121
+ .ipynb_checkpoints
122
+
123
+ # IPython
124
+ profile_default/
125
+ ipython_config.py
126
+
127
+ # pyenv
128
+ # For a library or package, you might want to ignore these files since the code is
129
+ # intended to run in multiple environments; otherwise, check them in:
130
+ # .python-version
131
+
132
+ # pipenv
133
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
134
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
135
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
136
+ # install all needed dependencies.
137
+ #Pipfile.lock
138
+
139
+ # poetry
140
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
141
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
142
+ # commonly ignored for libraries.
143
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
144
+ #poetry.lock
145
+
146
+ # pdm
147
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
148
+ #pdm.lock
149
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
150
+ # in version control.
151
+ # https://pdm.fming.dev/#use-with-ide
152
+ .pdm.toml
153
+
154
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
155
+ __pypackages__/
156
+
157
+ # Celery stuff
158
+ celerybeat-schedule
159
+ celerybeat.pid
160
+
161
+ # SageMath parsed files
162
+ *.sage.py
163
+
164
+ # Environments
165
+ .venv
166
+ env/
167
+ venv/
168
+ ENV/
169
+ env.bak/
170
+ venv.bak/
171
+
172
+ # Spyder project settings
173
+ .spyderproject
174
+ .spyproject
175
+
176
+ # Rope project settings
177
+ .ropeproject
178
+
179
+ # mkdocs documentation
180
+ /site
181
+
182
+ # mypy
183
+ .mypy_cache/
184
+ .dmypy.json
185
+ dmypy.json
186
+
187
+ # Pyre type checker
188
+ .pyre/
189
+
190
+ # pytype static type analyzer
191
+ .pytype/
192
+
193
+ # Cython debug symbols
194
+ cython_debug/
195
+
196
+ # PyCharm
197
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
198
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
199
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
200
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
201
+ #.idea/
202
+
203
+ ### Python Patch ###
204
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
205
+ poetry.toml
206
+
207
+ # ruff
208
+ .ruff_cache/
209
+
210
+ # LSP config files
211
+ pyrightconfig.json
212
+
213
+ ### VisualStudioCode ###
214
+ .vscode/*
215
+ !.vscode/settings.json
216
+ !.vscode/tasks.json
217
+ !.vscode/launch.json
218
+ !.vscode/extensions.json
219
+ !.vscode/*.code-snippets
220
+
221
+ # Local History for Visual Studio Code
222
+ .history/
223
+
224
+ # Built Visual Studio Code Extensions
225
+ *.vsix
226
+
227
+ ### VisualStudioCode Patch ###
228
+ # Ignore all local history of files
229
+ .history
230
+ .ionide
231
+
232
+ # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,macos,dotenv,python
233
+
234
+ # Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)
235
+
236
+ # Local IDE metadata
237
+ .idea/
238
+
239
+ # Local environment files (keep template versioned)
240
+ .env.*
241
+ !.env.template
242
+
243
+ # Local runtime artifacts
244
+ *.sqlite
245
+ *.sqlite3
246
+
247
+ # Local project datasets, generated outputs and caches
248
+ data/out/**
249
+ data/raw/**
250
+ data/models/**
251
+ data/cache/**
252
+ !data/out/.gitkeep
253
+ !data/raw/.gitkeep
254
+ !data/models/.gitkeep
255
+ !data/cache/.gitkeep
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+
3
+ Copyright (c) 2026 Guillaume Lombardo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,175 @@
1
+ Metadata-Version: 2.4
2
+ Name: theme-extractor
3
+ Version: 0.1.0
4
+ Summary: Parse a corpus, manage it and extract themes from a corpus.
5
+ Project-URL: Homepage, https://github.com/Guillaume-Lombardo/theme-extractor
6
+ Project-URL: Repository, https://github.com/Guillaume-Lombardo/theme-extractor
7
+ Project-URL: Issues, https://github.com/Guillaume-Lombardo/theme-extractor/issues
8
+ License: # MIT License
9
+
10
+ Copyright (c) 2026 Guillaume Lombardo
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ License-File: LICENSE.md
30
+ Keywords: BERTopics,cli,corpus management,elasticsearch,keyBERT,llm,opensearch,python,sentence-transformers,text analysis,theme extraction
31
+ Classifier: Development Status :: 3 - Alpha
32
+ Classifier: Environment :: Console
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python :: 3
36
+ Classifier: Programming Language :: Python :: 3.13
37
+ Classifier: Topic :: Software Development :: Build Tools
38
+ Classifier: Topic :: Text Processing :: Markup
39
+ Requires-Python: >=3.13
40
+ Requires-Dist: httpx>=0.28.1
41
+ Requires-Dist: numpy>=2.0.0
42
+ Requires-Dist: pydantic-settings>=2.12.0
43
+ Requires-Dist: pydantic>=2.12.0
44
+ Requires-Dist: pyyaml>=6.0.3
45
+ Requires-Dist: scikit-learn>=1.3.0
46
+ Requires-Dist: structlog>=25.5.0
47
+ Description-Content-Type: text/markdown
48
+
49
+ # theme-extractor
50
+
51
+ Python toolkit to compare theme/topic extraction strategies on the same corpus, with:
52
+
53
+ - one unified CLI
54
+ - one unified JSON output schema
55
+ - offline-first execution options
56
+ - Elasticsearch/OpenSearch backend support
57
+
58
+ ## Why this project
59
+
60
+ `theme-extractor` helps you answer one practical question:
61
+ "Which extraction strategy works best for my corpus and constraints?"
62
+
63
+ It lets you run baseline lexical methods, embedding-based methods, and LLM-assisted methods with consistent outputs for easier comparison.
64
+
65
+ ## Core Commands
66
+
67
+ - `theme-extractor ingest`
68
+ - `theme-extractor extract`
69
+ - `theme-extractor benchmark`
70
+ - `theme-extractor evaluate`
71
+ - `theme-extractor report`
72
+ - `theme-extractor doctor`
73
+
74
+ ## Quickstart
75
+
76
+ Prerequisite: start one backend first (local Docker guide: [`/howto/docker-local.md`](howto/docker-local.md)).
77
+
78
+ ```bash
79
+ uv sync --group elasticsearch
80
+ uv run theme-extractor doctor --output data/out/doctor.json
81
+ uv run theme-extractor ingest --input data/raw --output data/out/ingest.json
82
+ uv run theme-extractor benchmark \
83
+ --methods baseline_tfidf,terms,significant_terms,keybert,bertopic \
84
+ --backend elasticsearch \
85
+ --backend-url http://localhost:9200 \
86
+ --index theme_extractor \
87
+ --focus both \
88
+ --output data/out/benchmark.json
89
+ uv run theme-extractor evaluate \
90
+ --input data/out/benchmark.json \
91
+ --output data/out/evaluation.json
92
+ uv run theme-extractor report \
93
+ --input data/out/benchmark.json \
94
+ --output data/out/report_benchmark.md
95
+ ```
96
+
97
+ Run `significant_text` separately with `--agg-field content` (see [`/howto/benchmark.md`](howto/benchmark.md)).
98
+
99
+ ## Methods Available
100
+
101
+ - Baselines:
102
+ - `baseline_tfidf`
103
+ - `terms`
104
+ - `significant_terms`
105
+ - `significant_text`
106
+ - Semantic:
107
+ - `keybert`
108
+ - `bertopic` (embedding on/off, reduction `none/svd/nmf/umap`, clustering `kmeans/hdbscan`)
109
+ - Generative:
110
+ - `llm` (offline fallback behavior supported)
111
+
112
+ ## What You Get
113
+
114
+ - Unified extraction schema:
115
+ - topic-first output (`topics`)
116
+ - optional document-topic links (`document_topics`)
117
+ - execution notes and metadata (`notes`, `metadata`)
118
+ - Benchmark output:
119
+ - per-method outputs
120
+ - comparison block
121
+ - Quantitative proxies via `evaluate`:
122
+ - topic coherence proxy
123
+ - inter-topic diversity
124
+ - run-to-run stability
125
+
126
+ ## Documentation Map (How-To)
127
+
128
+ Detailed operations are intentionally kept in `/howto`:
129
+
130
+ - [`/howto/ingest.md`](howto/ingest.md): ingestion, cleaning, stopwords, streaming mode
131
+ - [`/howto/extract.md`](howto/extract.md): single-method extraction and interpretation
132
+ - [`/howto/benchmark.md`](howto/benchmark.md): multi-method comparison workflow
133
+ - [`/howto/report.md`](howto/report.md): markdown reporting workflow
134
+ - [`/howto/release.md`](howto/release.md): PyPI/TestPyPI release workflow
135
+ - [`/howto/docker-local.md`](howto/docker-local.md): local Docker stacks (Elasticsearch/OpenSearch)
136
+ - [`/howto/troubleshooting.md`](howto/troubleshooting.md): common failures and fixes
137
+
138
+ Sphinx documentation is available under `/docs` (includes README, how-to pages, and API docstrings).
139
+
140
+ Build locally:
141
+
142
+ ```bash
143
+ uv run sphinx-build -b html docs docs/_build/html
144
+ ```
145
+
146
+ ## Configuration
147
+
148
+ Use `.env.template` as bootstrap:
149
+
150
+ ```bash
151
+ cp .env.template .env
152
+ set -a; source .env; set +a
153
+ ```
154
+
155
+ Important variable groups:
156
+
157
+ - backend/runtime (`THEME_EXTRACTOR_BACKEND*`, `THEME_EXTRACTOR_PROXY_URL`)
158
+ - ingestion stopwords (`THEME_EXTRACTOR_DEFAULT_STOPWORDS_ENABLED`, `THEME_EXTRACTOR_AUTO_STOPWORDS_*`)
159
+ - PDF OCR fallback (`THEME_EXTRACTOR_PDF_OCR_*`) for scanned PDFs
160
+ - `.msg` extraction (`THEME_EXTRACTOR_MSG_*`) for metadata and attachment policy
161
+ - local model resolution (`THEME_EXTRACTOR_LOCAL_MODELS_DIR`)
162
+ - BERTopic embedding cache (`THEME_EXTRACTOR_BERTOPIC_EMBEDDING_CACHE_*`)
163
+
164
+ ## Project Governance
165
+
166
+ - [`AGENTS.md`](AGENTS.md): operating rules
167
+ - [`plan.md`](plan.md): phased roadmap
168
+ - [`agent.md`](agent.md): agent charter
169
+
170
+ ## Community Standards
171
+
172
+ - [`CONTRIBUTING.md`](CONTRIBUTING.md)
173
+ - [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md)
174
+ - [`SECURITY.md`](SECURITY.md)
175
+ - [`SUPPORT.md`](SUPPORT.md)
@@ -0,0 +1,127 @@
1
+ # theme-extractor
2
+
3
+ Python toolkit to compare theme/topic extraction strategies on the same corpus, with:
4
+
5
+ - one unified CLI
6
+ - one unified JSON output schema
7
+ - offline-first execution options
8
+ - Elasticsearch/OpenSearch backend support
9
+
10
+ ## Why this project
11
+
12
+ `theme-extractor` helps you answer one practical question:
13
+ "Which extraction strategy works best for my corpus and constraints?"
14
+
15
+ It lets you run baseline lexical methods, embedding-based methods, and LLM-assisted methods with consistent outputs for easier comparison.
16
+
17
+ ## Core Commands
18
+
19
+ - `theme-extractor ingest`
20
+ - `theme-extractor extract`
21
+ - `theme-extractor benchmark`
22
+ - `theme-extractor evaluate`
23
+ - `theme-extractor report`
24
+ - `theme-extractor doctor`
25
+
26
+ ## Quickstart
27
+
28
+ Prerequisite: start one backend first (local Docker guide: [`/howto/docker-local.md`](howto/docker-local.md)).
29
+
30
+ ```bash
31
+ uv sync --group elasticsearch
32
+ uv run theme-extractor doctor --output data/out/doctor.json
33
+ uv run theme-extractor ingest --input data/raw --output data/out/ingest.json
34
+ uv run theme-extractor benchmark \
35
+ --methods baseline_tfidf,terms,significant_terms,keybert,bertopic \
36
+ --backend elasticsearch \
37
+ --backend-url http://localhost:9200 \
38
+ --index theme_extractor \
39
+ --focus both \
40
+ --output data/out/benchmark.json
41
+ uv run theme-extractor evaluate \
42
+ --input data/out/benchmark.json \
43
+ --output data/out/evaluation.json
44
+ uv run theme-extractor report \
45
+ --input data/out/benchmark.json \
46
+ --output data/out/report_benchmark.md
47
+ ```
48
+
49
+ Run `significant_text` separately with `--agg-field content` (see [`/howto/benchmark.md`](howto/benchmark.md)).
50
+
51
+ ## Methods Available
52
+
53
+ - Baselines:
54
+ - `baseline_tfidf`
55
+ - `terms`
56
+ - `significant_terms`
57
+ - `significant_text`
58
+ - Semantic:
59
+ - `keybert`
60
+ - `bertopic` (embedding on/off, reduction `none/svd/nmf/umap`, clustering `kmeans/hdbscan`)
61
+ - Generative:
62
+ - `llm` (offline fallback behavior supported)
63
+
64
+ ## What You Get
65
+
66
+ - Unified extraction schema:
67
+ - topic-first output (`topics`)
68
+ - optional document-topic links (`document_topics`)
69
+ - execution notes and metadata (`notes`, `metadata`)
70
+ - Benchmark output:
71
+ - per-method outputs
72
+ - comparison block
73
+ - Quantitative proxies via `evaluate`:
74
+ - topic coherence proxy
75
+ - inter-topic diversity
76
+ - run-to-run stability
77
+
78
+ ## Documentation Map (How-To)
79
+
80
+ Detailed operations are intentionally kept in `/howto`:
81
+
82
+ - [`/howto/ingest.md`](howto/ingest.md): ingestion, cleaning, stopwords, streaming mode
83
+ - [`/howto/extract.md`](howto/extract.md): single-method extraction and interpretation
84
+ - [`/howto/benchmark.md`](howto/benchmark.md): multi-method comparison workflow
85
+ - [`/howto/report.md`](howto/report.md): markdown reporting workflow
86
+ - [`/howto/release.md`](howto/release.md): PyPI/TestPyPI release workflow
87
+ - [`/howto/docker-local.md`](howto/docker-local.md): local Docker stacks (Elasticsearch/OpenSearch)
88
+ - [`/howto/troubleshooting.md`](howto/troubleshooting.md): common failures and fixes
89
+
90
+ Sphinx documentation is available under `/docs` (includes README, how-to pages, and API docstrings).
91
+
92
+ Build locally:
93
+
94
+ ```bash
95
+ uv run sphinx-build -b html docs docs/_build/html
96
+ ```
97
+
98
+ ## Configuration
99
+
100
+ Use `.env.template` as bootstrap:
101
+
102
+ ```bash
103
+ cp .env.template .env
104
+ set -a; source .env; set +a
105
+ ```
106
+
107
+ Important variable groups:
108
+
109
+ - backend/runtime (`THEME_EXTRACTOR_BACKEND*`, `THEME_EXTRACTOR_PROXY_URL`)
110
+ - ingestion stopwords (`THEME_EXTRACTOR_DEFAULT_STOPWORDS_ENABLED`, `THEME_EXTRACTOR_AUTO_STOPWORDS_*`)
111
+ - PDF OCR fallback (`THEME_EXTRACTOR_PDF_OCR_*`) for scanned PDFs
112
+ - `.msg` extraction (`THEME_EXTRACTOR_MSG_*`) for metadata and attachment policy
113
+ - local model resolution (`THEME_EXTRACTOR_LOCAL_MODELS_DIR`)
114
+ - BERTopic embedding cache (`THEME_EXTRACTOR_BERTOPIC_EMBEDDING_CACHE_*`)
115
+
116
+ ## Project Governance
117
+
118
+ - [`AGENTS.md`](AGENTS.md): operating rules
119
+ - [`plan.md`](plan.md): phased roadmap
120
+ - [`agent.md`](agent.md): agent charter
121
+
122
+ ## Community Standards
123
+
124
+ - [`CONTRIBUTING.md`](CONTRIBUTING.md)
125
+ - [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md)
126
+ - [`SECURITY.md`](SECURITY.md)
127
+ - [`SUPPORT.md`](SUPPORT.md)
@@ -0,0 +1,13 @@
1
+ # How-To Guides
2
+
3
+ This folder contains practical command recipes to run the package end-to-end.
4
+
5
+ - `ingest.md`: build an ingestion output from local files.
6
+ - `extract.md`: run one extraction strategy and interpret the output.
7
+ - `benchmark.md`: compare multiple strategies and choose a candidate.
8
+ - `report.md`: generate and read markdown reports from JSON outputs.
9
+ - `release.md`: publish package distributions to TestPyPI/PyPI.
10
+ - `docker-local.md`: start/stop local Elasticsearch or OpenSearch stacks.
11
+ - `troubleshooting.md`: common issues and practical fixes.
12
+
13
+ All commands are executed from the project root.
@@ -0,0 +1,117 @@
1
+ [project]
2
+ name = "theme-extractor"
3
+ version = "0.1.0"
4
+ description = "Parse a corpus, manage it and extract themes from a corpus."
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ license = { file = "LICENSE.md" }
8
+ keywords = ["theme extraction", "corpus management", "text analysis", "python", "elasticsearch", "cli", "opensearch", "llm", "keyBERT", "sentence-transformers", "BERTopics"]
9
+ classifiers = [
10
+ "Development Status :: 3 - Alpha",
11
+ "Environment :: Console",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.13",
16
+ "Topic :: Software Development :: Build Tools",
17
+ "Topic :: Text Processing :: Markup",
18
+ ]
19
+ dependencies = [
20
+ "pydantic>=2.12.0",
21
+ "pydantic-settings>=2.12.0",
22
+ "httpx>=0.28.1",
23
+ "structlog>=25.5.0",
24
+ "pyyaml>=6.0.3",
25
+ "scikit-learn>=1.3.0",
26
+ "numpy>=2.0.0",
27
+ ]
28
+
29
+ [dependency-groups]
30
+ dev = [
31
+ "ruff>=0.15.0",
32
+ "ty>=0.0.15",
33
+ "pytest>=9.0.2",
34
+ "pytest-cov>=7.0.0",
35
+ "pytest-mock>=3.15.1",
36
+ "pytest-sugar>=1.1.1",
37
+ "pre-commit>=4.5.1",
38
+ "detect-secrets>=1.5.0",
39
+ "sphinx>=9.1.0",
40
+ "myst-parser>=5.0.0",
41
+ ]
42
+ bert = [
43
+ "keyBERT>=0.7.0",
44
+ "sentence-transformers>=5.2.2",
45
+ "BERTopic>=0.15.0",
46
+ "umap-learn>=0.5.3",
47
+ "numba>=0.63.1",
48
+ "llvmlite>=0.46.0",
49
+ ]
50
+ elasticsearch = [
51
+ "elasticsearch>=9.3.0",
52
+ ]
53
+ opensearch = [
54
+ "opensearch-py>=3.1.0",
55
+ ]
56
+ llm = [
57
+ "openai>=1.68.2",
58
+ ]
59
+
60
+ [build-system]
61
+ requires = ["hatchling", "build"]
62
+ build-backend = "hatchling.build"
63
+
64
+ [tool.hatch.build.targets.wheel]
65
+ packages = ["src/theme_extractor"]
66
+ include = ["src/theme_extractor/**"]
67
+ exclude = ["src/theme_extractor/**/*.pyc", "src/theme_extractor/__pycache__/**"]
68
+
69
+ [tool.hatch.build.targets.sdist]
70
+ include = ["src/theme_extractor/**", "README.md", "LICENSE.md", "pyproject.toml"]
71
+ exclude = ["src/theme_extractor/**/*.pyc", "src/theme_extractor/__pycache__/**"]
72
+
73
+
74
+ [project.scripts]
75
+ theme-extractor = "theme_extractor.cli:main"
76
+
77
+ [project.urls]
78
+ Homepage = "https://github.com/Guillaume-Lombardo/theme-extractor"
79
+ Repository = "https://github.com/Guillaume-Lombardo/theme-extractor"
80
+ Issues = "https://github.com/Guillaume-Lombardo/theme-extractor/issues"
81
+
82
+
83
+ [tool.pytest.ini_options]
84
+ minversion = "9.0"
85
+ testpaths = ["tests"]
86
+ addopts = "-ra -q -m unit --strict-markers --cov=src --cov-report=term-missing:skip-covered --cov-report=html"
87
+ pythonpath = ["src"]
88
+ markers = [
89
+ "unit: mark a test as a unit test.",
90
+ "integration: mark a test as an integration test.",
91
+ "end2end: mark a test as an end-to-end test.",
92
+ ]
93
+
94
+ [tool.coverage.run]
95
+ branch = true
96
+ source = ["src/theme_extractor"]
97
+ omit = ["*/tests/*"]
98
+
99
+ [tool.coverage.report]
100
+ exclude_lines = [
101
+ "def __repr__",
102
+ "def __str__",
103
+ "if self.debug",
104
+ "if settings.DEBUG",
105
+ "if __name__ == .__main__.:",
106
+ "pragma: no cover",
107
+ "raise AssertionError",
108
+ "raise NotImplementedError",
109
+ ]
110
+ show_missing = true
111
+ skip_covered = true
112
+ precision = 2
113
+ fail_under = 80
114
+
115
+ [tool.coverage.html]
116
+ directory = "htmlcov"
117
+ title = "Theme Extractor Coverage Report"