textflow-ir 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74):
  1. textflow_ir-0.1.0/.gitignore +11 -0
  2. textflow_ir-0.1.0/CMakeLists.txt +171 -0
  3. textflow_ir-0.1.0/PKG-INFO +283 -0
  4. textflow_ir-0.1.0/README.md +265 -0
  5. textflow_ir-0.1.0/bindings/python/module.cpp +296 -0
  6. textflow_ir-0.1.0/cmake/termflowConfig.cmake.in +6 -0
  7. textflow_ir-0.1.0/docker/Dockerfile +30 -0
  8. textflow_ir-0.1.0/docker-compose.yml +9 -0
  9. textflow_ir-0.1.0/docs/customization.md +228 -0
  10. textflow_ir-0.1.0/docs/installation-roadmap.md +118 -0
  11. textflow_ir-0.1.0/docs/installation.md +189 -0
  12. textflow_ir-0.1.0/docs/usage.md +219 -0
  13. textflow_ir-0.1.0/examples/analyze_query.cpp +94 -0
  14. textflow_ir-0.1.0/examples/analyze_text.cpp +39 -0
  15. textflow_ir-0.1.0/examples/custom_analyzer.cpp +66 -0
  16. textflow_ir-0.1.0/examples/extract_terms.cpp +44 -0
  17. textflow_ir-0.1.0/examples/find_package_consumer/CMakeLists.txt +9 -0
  18. textflow_ir-0.1.0/examples/find_package_consumer/main.cpp +15 -0
  19. textflow_ir-0.1.0/include/termflow/analysis/analyzer.hpp +53 -0
  20. textflow_ir-0.1.0/include/termflow/analysis/ascii_folding_filter.hpp +12 -0
  21. textflow_ir-0.1.0/include/termflow/analysis/char_filter.hpp +21 -0
  22. textflow_ir-0.1.0/include/termflow/analysis/english_analyzer.hpp +90 -0
  23. textflow_ir-0.1.0/include/termflow/analysis/english_possessive_filter.hpp +12 -0
  24. textflow_ir-0.1.0/include/termflow/analysis/lower_case_filter.hpp +12 -0
  25. textflow_ir-0.1.0/include/termflow/analysis/porter_stem_filter.hpp +12 -0
  26. textflow_ir-0.1.0/include/termflow/analysis/standard_tokenizer.hpp +34 -0
  27. textflow_ir-0.1.0/include/termflow/analysis/stop_filter.hpp +22 -0
  28. textflow_ir-0.1.0/include/termflow/analysis/term_extractor.hpp +55 -0
  29. textflow_ir-0.1.0/include/termflow/analysis/token.hpp +27 -0
  30. textflow_ir-0.1.0/include/termflow/analysis/token_filter.hpp +22 -0
  31. textflow_ir-0.1.0/include/termflow/analysis/tokenizer.hpp +24 -0
  32. textflow_ir-0.1.0/include/termflow/analysis/unicode_normalize_filter.hpp +12 -0
  33. textflow_ir-0.1.0/include/termflow/query/query_analyzer.hpp +53 -0
  34. textflow_ir-0.1.0/include/termflow/query/query_parser.hpp +30 -0
  35. textflow_ir-0.1.0/include/termflow/query/query_types.hpp +96 -0
  36. textflow_ir-0.1.0/include/termflow/query/rewrite_loader.hpp +33 -0
  37. textflow_ir-0.1.0/include/termflow/query/rewrite_validator.hpp +78 -0
  38. textflow_ir-0.1.0/pyproject.toml +79 -0
  39. textflow_ir-0.1.0/python/termflow/__init__.py +32 -0
  40. textflow_ir-0.1.0/src/analysis/analyzer.cpp +71 -0
  41. textflow_ir-0.1.0/src/analysis/ascii_folding_filter.cpp +13 -0
  42. textflow_ir-0.1.0/src/analysis/english_analyzer.cpp +105 -0
  43. textflow_ir-0.1.0/src/analysis/english_possessive_filter.cpp +13 -0
  44. textflow_ir-0.1.0/src/analysis/lower_case_filter.cpp +13 -0
  45. textflow_ir-0.1.0/src/analysis/porter_stem_filter.cpp +15 -0
  46. textflow_ir-0.1.0/src/analysis/standard_tokenizer.cpp +199 -0
  47. textflow_ir-0.1.0/src/analysis/stop_filter.cpp +39 -0
  48. textflow_ir-0.1.0/src/analysis/term_extractor.cpp +149 -0
  49. textflow_ir-0.1.0/src/analysis/unicode_normalize_filter.cpp +13 -0
  50. textflow_ir-0.1.0/src/query/query_analyzer.cpp +105 -0
  51. textflow_ir-0.1.0/src/query/query_parser.cpp +70 -0
  52. textflow_ir-0.1.0/src/query/rewrite_internal.hpp +50 -0
  53. textflow_ir-0.1.0/src/query/rewrite_loader.cpp +215 -0
  54. textflow_ir-0.1.0/src/query/rewrite_validator.cpp +292 -0
  55. textflow_ir-0.1.0/src/util/porter_stemmer.cpp +531 -0
  56. textflow_ir-0.1.0/src/util/porter_stemmer.hpp +10 -0
  57. textflow_ir-0.1.0/src/util/unicode.cpp +190 -0
  58. textflow_ir-0.1.0/src/util/unicode.hpp +17 -0
  59. textflow_ir-0.1.0/tests/english_analyzer_tests.cpp +241 -0
  60. textflow_ir-0.1.0/tests/filter_tests.cpp +109 -0
  61. textflow_ir-0.1.0/tests/python_smoke_test.py +98 -0
  62. textflow_ir-0.1.0/tests/query_tests.cpp +336 -0
  63. textflow_ir-0.1.0/tests/standard_tokenizer_tests.cpp +82 -0
  64. textflow_ir-0.1.0/tests/term_extractor_tests.cpp +42 -0
  65. textflow_ir-0.1.0/tests/test_framework.cpp +22 -0
  66. textflow_ir-0.1.0/tests/test_framework.hpp +80 -0
  67. textflow_ir-0.1.0/tests/test_helpers.hpp +38 -0
  68. textflow_ir-0.1.0/tests/test_main.cpp +23 -0
  69. textflow_ir-0.1.0/tools/build_python_dist.sh +5 -0
  70. textflow_ir-0.1.0/tools/publish_python_dist.sh +9 -0
  71. textflow_ir-0.1.0/tools/test_full_build.sh +31 -0
  72. textflow_ir-0.1.0/tools/test_in_docker.sh +4 -0
  73. textflow_ir-0.1.0/tools/test_installed_consumer.sh +47 -0
  74. textflow_ir-0.1.0/tools/test_python_wheel_install.sh +34 -0
@@ -0,0 +1,11 @@
1
+ /.DS_Store
2
+ /__pycache__/
3
+ *.pyc
4
+ *.egg-info/
5
+ /.venv/
6
+ /build/
7
+ /dist/
8
+ /wheelhouse/
9
+ /.local/
10
+ /.tmp_bench_corpus/
11
+ /.idea/
@@ -0,0 +1,171 @@
# Top-level build description for termflow.
cmake_minimum_required(VERSION 3.24)

project(termflow
  VERSION 0.1.0
  DESCRIPTION "English text analysis library for C++"
  LANGUAGES CXX
)

# GNUInstallDirs provides the CMAKE_INSTALL_* destinations and
# CMakePackageConfigHelpers provides the package-file generators that the
# install rules later in this file rely on.
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

# Feature switches. Everything except the Python bindings defaults to ON.
option(TERMFLOW_BUILD_TESTS
  "Build the termflow test suite" ON)
option(TERMFLOW_BUILD_EXAMPLES
  "Build the termflow example programs" ON)
option(TERMFLOW_BUILD_TOOLS
  "Build the termflow command-line tools" ON)
option(TERMFLOW_BUILD_PYTHON
  "Build the termflow Python bindings" OFF)
option(TERMFLOW_INSTALL_CPP_ARTIFACTS
  "Install C++ library and development artifacts" ON)

# Default language level for every target created below: C++20, mandatory,
# with compiler-specific extensions switched off.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# ICU is an unconditional dependency; the i18n and uc components are the ones
# the library target links against.
find_package(ICU REQUIRED COMPONENTS i18n uc)

# The Python interpreter, module development files, and pybind11 are only
# searched for when the bindings are requested.
if(TERMFLOW_BUILD_PYTHON)
  find_package(Python3 REQUIRED
    COMPONENTS
      Interpreter
      Development.Module
  )
  find_package(pybind11 REQUIRED CONFIG)
endif()

# Core library. No library type is given, so CMake chooses static or shared
# according to BUILD_SHARED_LIBS.
add_library(termflow)
add_library(termflow::termflow ALIAS termflow)

target_sources(termflow
  PRIVATE
    # Analysis pipeline
    src/analysis/analyzer.cpp
    src/analysis/ascii_folding_filter.cpp
    src/analysis/english_analyzer.cpp
    src/analysis/english_possessive_filter.cpp
    src/analysis/lower_case_filter.cpp
    src/analysis/porter_stem_filter.cpp
    src/analysis/standard_tokenizer.cpp
    src/analysis/stop_filter.cpp
    src/analysis/term_extractor.cpp
    src/analysis/unicode_normalize_filter.cpp
    # Query module
    src/query/query_analyzer.cpp
    src/query/query_parser.cpp
    src/query/rewrite_loader.cpp
    src/query/rewrite_validator.cpp
    # Shared utilities
    src/util/porter_stemmer.cpp
    src/util/unicode.cpp
)

# Usage requirements. C++20 and ICU are PUBLIC, so they propagate to
# consumers of the target.
target_compile_features(termflow PUBLIC cxx_std_20)
target_link_libraries(termflow
  PUBLIC
    ICU::i18n
    ICU::uc
)

# Public headers come from include/ in the build tree and from the install
# include directory once installed; src/ is visible only while building the
# library itself.
target_include_directories(termflow
  PUBLIC
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
  PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/src"
)

# Warning flags apply to the library's own sources only (PRIVATE).
if(NOT MSVC)
  target_compile_options(termflow PRIVATE -Wall -Wextra -Wpedantic)
else()
  target_compile_options(termflow PRIVATE /W4 /permissive-)
endif()

set_target_properties(termflow
  PROPERTIES
    EXPORT_NAME termflow
    POSITION_INDEPENDENT_CODE ON
    VERSION "${PROJECT_VERSION}"
    SOVERSION 0
)

if(TERMFLOW_BUILD_EXAMPLES)
  # Creates one example executable from a single source file and links it
  # against the termflow library.
  #   target_name  name of the executable target
  #   source_file  source path, relative to this directory
  function(termflow_add_example target_name source_file)
    add_executable(${target_name} ${source_file})
    target_link_libraries(${target_name} PRIVATE termflow::termflow)
  endfunction()

  termflow_add_example(termflow_analyze examples/analyze_text.cpp)
  termflow_add_example(termflow_extract_terms examples/extract_terms.cpp)
  termflow_add_example(termflow_custom_analyzer examples/custom_analyzer.cpp)
  termflow_add_example(termflow_analyze_query examples/analyze_query.cpp)
endif()

# The body of this block is empty in this file, so TERMFLOW_BUILD_TOOLS
# currently has no effect here.
if(TERMFLOW_BUILD_TOOLS)
endif()

if(TERMFLOW_BUILD_PYTHON)
  # Stage an importable `termflow` package inside the build tree: the pure
  # Python __init__.py is copied here and the compiled extension module is
  # emitted into the same directory, so the package can be imported with
  # PYTHONPATH pointing at <build>/python.
  set(TERMFLOW_PYTHON_PACKAGE_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/termflow")
  file(MAKE_DIRECTORY "${TERMFLOW_PYTHON_PACKAGE_DIR}")
  configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/python/termflow/__init__.py"
    "${TERMFLOW_PYTHON_PACKAGE_DIR}/__init__.py"
    COPYONLY
  )

  # Extension module; it is built as `_termflow` inside the staged package.
  pybind11_add_module(termflow_python MODULE bindings/python/module.cpp)
  target_link_libraries(termflow_python PRIVATE termflow::termflow)

  # Multi-config generators (Visual Studio, Xcode, Ninja Multi-Config) append
  # a per-configuration subdirectory such as Debug/ to a plain output
  # directory, which would place the module away from the staged __init__.py
  # and break the import. Wrapping the path in the always-true $<1:...>
  # generator expression suppresses that suffix, so every generator writes
  # the module directly into the package directory.
  set_target_properties(termflow_python
    PROPERTIES
      OUTPUT_NAME _termflow
      LIBRARY_OUTPUT_DIRECTORY "$<1:${TERMFLOW_PYTHON_PACKAGE_DIR}>"
      RUNTIME_OUTPUT_DIRECTORY "$<1:${TERMFLOW_PYTHON_PACKAGE_DIR}>"
  )

  # Install layout mirrors the staged package: the extension module and
  # __init__.py both go into a `termflow` directory under the install prefix.
  install(
    TARGETS termflow_python
    LIBRARY DESTINATION termflow
    RUNTIME DESTINATION termflow
  )

  install(
    FILES python/termflow/__init__.py
    DESTINATION termflow
  )
endif()

if(TERMFLOW_BUILD_TESTS)
  enable_testing()

  # One test executable built from all C++ test sources.
  add_executable(termflow_tests)
  target_sources(termflow_tests
    PRIVATE
      tests/english_analyzer_tests.cpp
      tests/filter_tests.cpp
      tests/query_tests.cpp
      tests/standard_tokenizer_tests.cpp
      tests/term_extractor_tests.cpp
      tests/test_framework.cpp
      tests/test_main.cpp
      tests/test_helpers.hpp
  )
  target_link_libraries(termflow_tests PRIVATE termflow::termflow)

  # Register the executable with CTest under its own name.
  add_test(NAME termflow_tests COMMAND termflow_tests)

  # With the bindings enabled, also run the Python smoke test, with
  # PYTHONPATH pointing at the python/ directory in the build tree.
  if(TERMFLOW_BUILD_PYTHON)
    add_test(
      NAME termflow_python_smoke
      COMMAND
        ${Python3_EXECUTABLE}
        ${CMAKE_CURRENT_SOURCE_DIR}/tests/python_smoke_test.py
    )
    set_property(
      TEST termflow_python_smoke
      PROPERTY ENVIRONMENT "PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/python"
    )
  endif()
endif()
if(TERMFLOW_INSTALL_CPP_ARTIFACTS)
  # Generate the package version and config files in the build tree. The
  # version file accepts any request with the same major version.
  write_basic_package_version_file(
    "${CMAKE_CURRENT_BINARY_DIR}/termflowConfigVersion.cmake"
    VERSION "${PROJECT_VERSION}"
    COMPATIBILITY SameMajorVersion
  )
  configure_package_config_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/cmake/termflowConfig.cmake.in"
    "${CMAKE_CURRENT_BINARY_DIR}/termflowConfig.cmake"
    INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/termflow"
  )

  # Library binaries, recorded in the termflowTargets export set.
  install(
    TARGETS termflow
    EXPORT termflowTargets
    ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
  )

  # Public headers: the contents of include/ are copied into the install
  # include directory.
  install(
    DIRECTORY include/
    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
  )

  # Exported targets file; targets are namespaced as termflow::.
  install(
    EXPORT termflowTargets
    FILE termflowTargets.cmake
    NAMESPACE termflow::
    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/termflow"
  )

  # Generated config and version files, installed next to the targets file.
  install(
    FILES
      "${CMAKE_CURRENT_BINARY_DIR}/termflowConfig.cmake"
      "${CMAKE_CURRENT_BINARY_DIR}/termflowConfigVersion.cmake"
    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/termflow"
  )
endif()
@@ -0,0 +1,283 @@
1
+ Metadata-Version: 2.2
2
+ Name: textflow-ir
3
+ Version: 0.1.0
4
+ Summary: English text analysis for information retrieval
5
+ Keywords: text-analysis,information-retrieval,tokenization,stemming,search
6
+ Author: Mustafa Abualsaud
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3 :: Only
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: C++
15
+ Classifier: Topic :: Text Processing :: Linguistic
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+
19
+ # termflow
20
+
21
+ `termflow` is a standalone C++20 library for English text analysis aimed at information retrieval workloads.
22
+
23
+ ## Documentation
24
+
25
+ - [docs/usage.md](docs/usage.md) for day-to-day analyzer, term extraction, query, and Python usage
26
+ - [docs/customization.md](docs/customization.md) for pipeline tuning, query rewrites, and custom analyzers in C++
27
+ - [docs/installation.md](docs/installation.md) for current C++ and Python installation paths
28
+ - [docs/installation-roadmap.md](docs/installation-roadmap.md) for the prioritized installation improvement plan
29
+
30
+ ## Goals
31
+
32
+ - Library-first design with a small public API
33
+ - CMake-based build and install flow
34
+ - Clean, testable analysis components
35
+ - English-first analysis for v1
36
+ - Optional Python bindings on top of the same core library
37
+
38
+ ## Current Scope
39
+
40
+ `termflow` currently includes:
41
+
42
+ - `Token`
43
+ - `CharFilter`
44
+ - `Tokenizer`
45
+ - `TokenFilter`
46
+ - `Analyzer`
47
+ - `TermExtractor`
48
+ - `termflow::query::QueryParser`
49
+ - `termflow::query::RewriteLoader`
50
+ - `termflow::query::RewriteValidator`
51
+ - `termflow::query::QueryAnalyzer`
52
+ - `StandardTokenizer`
53
+ - `UnicodeNormalizeFilter`
54
+ - `LowerCaseFilter`
55
+ - `EnglishPossessiveFilter`
56
+ - `StopFilter`
57
+ - `PorterStemFilter`
58
+ - `AsciiFoldingFilter`
59
+ - `EnglishAnalyzer`
60
+
61
+ Core analyzer APIs:
62
+
63
+ - `analyze(text) -> std::vector<Token>`
64
+ - `analyze_terms(text) -> std::vector<std::string>`
65
+ - `analyze_term(text) -> std::optional<std::string>`
66
+ - `analyze_to_string(text, separator = " ") -> std::string`
67
+ - `normalize(text) -> std::string`
68
+
69
+ `analyze()` runs the full token pipeline. `normalize()` is deliberately lighter-weight and does not tokenize.
70
+
71
+ The query module is intentionally text-focused. It can parse lightweight query intent, analyze clauses through an existing analyzer, and attach optional query-time rewrites. It does not score documents or execute retrieval.
72
+
73
+ ## Behavior Contract
74
+
75
+ For `EnglishAnalyzer`, `analyze()` applies stages in this order:
76
+
77
+ 1. `StandardTokenizer`
78
+ 2. optional Unicode normalization
79
+ 3. optional English possessive stripping
80
+ 4. optional lowercasing
81
+ 5. optional stop-word filtering
82
+ 6. keyword marking from `stem_exclusion_set`
83
+ 7. optional Porter stemming
84
+ 8. optional ASCII folding
85
+
86
+ `normalize()` is intentionally narrower. It honors only:
87
+
88
+ - `unicode_normalization`
89
+ - `lowercase`
90
+ - `ascii_folding`
91
+
92
+ It does not tokenize, strip possessives, remove stop words, mark keywords, or stem.
93
+
94
+ Stage toggles are explicit. Disabling a stage removes only that stage. For example, if `lowercase=false` and `stemming=true`, stemming runs on the token text produced by the earlier enabled stages without forcing lowercase.
95
+
96
+ Custom `stop_words` and `stem_exclusion_set` are normalized using the enabled pre-stem stages so they match the token text seen by stop-word filtering and keyword marking.
97
+
98
+ ## Term Extraction
99
+
100
+ `TermExtractor` is the public helper for consumers that want finalized term strings with explicit post-analysis filtering. It can enforce minimum length, numeric-term handling, and character policy filters without changing the analyzer itself.
101
+
102
+ Example:
103
+
104
+ ```cpp
105
+ termflow::EnglishAnalyzerOptions analyzer_options;
106
+ analyzer_options.use_default_stop_words = false;
107
+ termflow::EnglishAnalyzer analyzer(analyzer_options);
108
+
109
+ termflow::TermExtractionOptions extraction_options;
110
+ extraction_options.min_term_length = 2;
111
+ extraction_options.keep_numeric_terms = false;
112
+ extraction_options.character_policy = termflow::TermCharacterPolicy::ascii_alphabetic;
113
+
114
+ termflow::TermExtractor extractor(analyzer, extraction_options);
115
+ auto terms = extractor.extract("Running beaches 2024 e-mail");
116
+ ```
117
+
118
+ A runnable example is included at [examples/extract_terms.cpp](examples/extract_terms.cpp).
119
+
120
+ ## Query Analysis
121
+
122
+ `termflow::query` provides a small query-intent layer on top of existing analyzers. The current scope is:
123
+
124
+ - clause parsing for bare terms, `+required`, `-excluded`, and quoted groups
125
+ - analyzed query terms produced by an existing `Analyzer`
126
+ - optional query-time rewrites for canonicalization, equivalent alternatives, and expansions
127
+ - analyzer-aware validation for rewrite packs
128
+
129
+ This is intentionally not a search engine query model. The query module does not score, rank, filter, or execute queries against an index.
130
+
131
+ Example:
132
+
133
+ ```cpp
134
+ #include "termflow/analysis/english_analyzer.hpp"
135
+ #include "termflow/query/query_analyzer.hpp"
136
+ #include "termflow/query/rewrite_loader.hpp"
137
+
138
+ termflow::EnglishAnalyzer analyzer;
139
+ termflow::query::RewriteLoader loader;
140
+ termflow::query::RewriteValidator validator(analyzer);
141
+
142
+ termflow::query::QueryAnalysisOptions options;
143
+ options.rewrites = loader.load_file("config/query-rewrites.tfq");
144
+ const auto validation = validator.validate(options);
145
+
146
+ termflow::query::QueryAnalyzer query_analyzer(analyzer, options);
147
+ const auto query = query_analyzer.analyze(R"(tv +"car rentals" -expired)");
148
+ ```
149
+
150
+ In that example:
151
+
152
+ - `tv` is canonically rewritten before it reaches downstream scoring logic
153
+ - `car rentals` keeps its primary analyzed terms and also receives an equivalent alternative
154
+ - `-expired` is parsed as an excluded clause
155
+
156
+ Rewrites match exact analyzed term sequences at the clause level. Quoted groups remain grouped in the parsed and analyzed query representation, but `termflow` does not implement phrase execution logic.
157
+
158
+ `RewriteValidator` reports rewrite rules that analyze to no terms, canonical conflicts and cycles, and alternatives that are duplicated or shadowed by the configured deduplication policy. It is a reporting layer only; it does not rewrite or reorder the supplied rules.
159
+
160
+ Rewrite files use a small declarative syntax:
161
+
162
+ ```text
163
+ # Canonical rewrites
164
+ lotr -> lord of the rings;
165
+ colour -> color;
166
+
167
+ # Equivalent alternatives
168
+ car rentals => automobile rentals;
169
+
170
+ # Expansion alternatives
171
+ car rentals ~> vehicle rentals;
172
+
173
+ # Includes are resolved relative to the current file
174
+ include "common-rewrites.tfq";
175
+ ```
176
+
177
+ Supported forms are intentionally limited to text rewrites and includes. `termflow` does not interpret field syntax, conditional macros, stop-word directives, boosts, or other higher-level query language constructs in rewrite files.
178
+
179
+ A runnable example is included at [examples/analyze_query.cpp](examples/analyze_query.cpp).
180
+
181
+ ## API Contracts
182
+
183
+ - Offsets are represented as UTF-16 code-unit offsets.
184
+ - `Analyzer`, `Tokenizer`, and `TermExtractor` are batch-oriented APIs that materialize results into `std::vector` containers.
185
+ - `QueryParser`, `RewriteLoader`, `RewriteValidator`, and `QueryAnalyzer` are also batch-oriented APIs that materialize parsed clauses, rewrite rules, validation reports, and analyzed term sequences into `std::vector` containers.
186
+ - v1 does not expose a streaming token API.
187
+ - Implementations may throw `std::runtime_error` when ICU cannot be initialized or fails while processing text.
188
+ - Invalid configuration is reported with standard exceptions such as `std::invalid_argument`.
189
+ - Constructed analyzers and filters are safe for concurrent const use. A typical parallel usage pattern is to share one configured analyzer across worker threads and analyze different documents concurrently.
190
+ - `QueryParser`, `RewriteLoader`, `RewriteValidator`, and `QueryAnalyzer` are also safe for concurrent const use after construction.
191
+ - Do not reconfigure mutable objects, such as a tokenizer with a new max token length, concurrently with analysis.
192
+
193
+ ## Design Notes
194
+
195
+ - Tokenization uses ICU word boundaries.
196
+ - ASCII folding is optional and disabled by default.
197
+ - The Porter stemmer is embedded and scoped for English terms.
198
+ - The built-in English analyzer uses an optimized internal fast path for its pre-stem normalization stages. Tests compare that path against an explicitly composed public-filter pipeline to keep them aligned.
199
+
200
+ ## Custom Analyzers
201
+
202
+ The public tokenizer and filter interfaces are intended to support explicit analyzer construction. A runnable example is included at [examples/custom_analyzer.cpp](examples/custom_analyzer.cpp).
203
+
204
+ That example shows how to:
205
+
206
+ - subclass `Analyzer`
207
+ - reuse `StandardTokenizer`
208
+ - compose public token filters directly
209
+ - define a separate `normalize()` path for the custom analyzer
210
+
211
+ The query module is designed to compose with custom analyzers as well. `QueryAnalyzer` accepts any configured `Analyzer` and uses that analyzer's `analyze_terms()` behavior when normalizing query clauses and rewrite rules.
212
+
213
+ ## Limitations
214
+
215
+ This is intentionally a small v1.
216
+
217
+ - English only
218
+ - No token graphs
219
+ - No token-graph-style synonym processing
220
+ - No phrase execution logic
221
+ - Query parsing is intentionally lightweight: no escaping, precedence rules, boosts, or field syntax
222
+ - No index structures
223
+ - No multilingual analyzers
224
+ - No concrete `CharFilter` implementations yet
225
+
226
+ ## Future Direction
227
+
228
+ The core abstractions are language-agnostic. The intended direction is:
229
+
230
+ - keep `termflow` core focused on shared analysis primitives
231
+ - keep query-intent analysis focused on parsing and rewrite preparation rather than scoring
232
+ - add language analyzers as independent modules or packages
233
+ - keep analyzer construction explicit rather than reflection-driven
234
+ - reuse shared Unicode handling where it improves correctness
235
+
236
+ ## Building
237
+
238
+ Local build:
239
+
240
+ ```bash
241
+ cmake -S . -B build -G Ninja
242
+ cmake --build build
243
+ ctest --test-dir build --output-on-failure
244
+ ```
245
+
246
+ The examples are also built by default:
247
+
248
+ - `termflow_analyze`
249
+ - `termflow_extract_terms`
250
+ - `termflow_custom_analyzer`
251
+ - `termflow_analyze_query`
252
+
253
+ ## Python Bindings
254
+
255
+ Optional Python bindings can be built on top of the same core library:
256
+
257
+ ```bash
258
+ cmake -S . -B build -G Ninja -DTERMFLOW_BUILD_PYTHON=ON
259
+ cmake --build build
260
+ PYTHONPATH=build/python python3 -c 'import termflow; print(termflow.EnglishAnalyzer().analyze_terms("Running Cars"))'
261
+ ```
262
+
263
+ Python wheel and sdist build:
264
+
265
+ ```bash
266
+ python3 -m build --sdist --wheel
267
+ python3 -m twine check dist/*
268
+ ```
269
+
270
+ The repository includes [pyproject.toml](pyproject.toml) and two helper scripts:
271
+
272
+ - [tools/build_python_dist.sh](tools/build_python_dist.sh)
273
+ - [tools/publish_python_dist.sh](tools/publish_python_dist.sh)
274
+
275
+ The optional Python bindings expose the same query module under `termflow.query`.
276
+
277
+ ## Container Build
278
+
279
+ The repository also includes a containerized dev/test environment for reproducible builds:
280
+
281
+ ```bash
282
+ docker compose run --rm dev bash -lc './tools/test_in_docker.sh'
283
+ ```