textflow-ir-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- textflow_ir-0.1.0/.gitignore +11 -0
- textflow_ir-0.1.0/CMakeLists.txt +171 -0
- textflow_ir-0.1.0/PKG-INFO +283 -0
- textflow_ir-0.1.0/README.md +265 -0
- textflow_ir-0.1.0/bindings/python/module.cpp +296 -0
- textflow_ir-0.1.0/cmake/termflowConfig.cmake.in +6 -0
- textflow_ir-0.1.0/docker/Dockerfile +30 -0
- textflow_ir-0.1.0/docker-compose.yml +9 -0
- textflow_ir-0.1.0/docs/customization.md +228 -0
- textflow_ir-0.1.0/docs/installation-roadmap.md +118 -0
- textflow_ir-0.1.0/docs/installation.md +189 -0
- textflow_ir-0.1.0/docs/usage.md +219 -0
- textflow_ir-0.1.0/examples/analyze_query.cpp +94 -0
- textflow_ir-0.1.0/examples/analyze_text.cpp +39 -0
- textflow_ir-0.1.0/examples/custom_analyzer.cpp +66 -0
- textflow_ir-0.1.0/examples/extract_terms.cpp +44 -0
- textflow_ir-0.1.0/examples/find_package_consumer/CMakeLists.txt +9 -0
- textflow_ir-0.1.0/examples/find_package_consumer/main.cpp +15 -0
- textflow_ir-0.1.0/include/termflow/analysis/analyzer.hpp +53 -0
- textflow_ir-0.1.0/include/termflow/analysis/ascii_folding_filter.hpp +12 -0
- textflow_ir-0.1.0/include/termflow/analysis/char_filter.hpp +21 -0
- textflow_ir-0.1.0/include/termflow/analysis/english_analyzer.hpp +90 -0
- textflow_ir-0.1.0/include/termflow/analysis/english_possessive_filter.hpp +12 -0
- textflow_ir-0.1.0/include/termflow/analysis/lower_case_filter.hpp +12 -0
- textflow_ir-0.1.0/include/termflow/analysis/porter_stem_filter.hpp +12 -0
- textflow_ir-0.1.0/include/termflow/analysis/standard_tokenizer.hpp +34 -0
- textflow_ir-0.1.0/include/termflow/analysis/stop_filter.hpp +22 -0
- textflow_ir-0.1.0/include/termflow/analysis/term_extractor.hpp +55 -0
- textflow_ir-0.1.0/include/termflow/analysis/token.hpp +27 -0
- textflow_ir-0.1.0/include/termflow/analysis/token_filter.hpp +22 -0
- textflow_ir-0.1.0/include/termflow/analysis/tokenizer.hpp +24 -0
- textflow_ir-0.1.0/include/termflow/analysis/unicode_normalize_filter.hpp +12 -0
- textflow_ir-0.1.0/include/termflow/query/query_analyzer.hpp +53 -0
- textflow_ir-0.1.0/include/termflow/query/query_parser.hpp +30 -0
- textflow_ir-0.1.0/include/termflow/query/query_types.hpp +96 -0
- textflow_ir-0.1.0/include/termflow/query/rewrite_loader.hpp +33 -0
- textflow_ir-0.1.0/include/termflow/query/rewrite_validator.hpp +78 -0
- textflow_ir-0.1.0/pyproject.toml +79 -0
- textflow_ir-0.1.0/python/termflow/__init__.py +32 -0
- textflow_ir-0.1.0/src/analysis/analyzer.cpp +71 -0
- textflow_ir-0.1.0/src/analysis/ascii_folding_filter.cpp +13 -0
- textflow_ir-0.1.0/src/analysis/english_analyzer.cpp +105 -0
- textflow_ir-0.1.0/src/analysis/english_possessive_filter.cpp +13 -0
- textflow_ir-0.1.0/src/analysis/lower_case_filter.cpp +13 -0
- textflow_ir-0.1.0/src/analysis/porter_stem_filter.cpp +15 -0
- textflow_ir-0.1.0/src/analysis/standard_tokenizer.cpp +199 -0
- textflow_ir-0.1.0/src/analysis/stop_filter.cpp +39 -0
- textflow_ir-0.1.0/src/analysis/term_extractor.cpp +149 -0
- textflow_ir-0.1.0/src/analysis/unicode_normalize_filter.cpp +13 -0
- textflow_ir-0.1.0/src/query/query_analyzer.cpp +105 -0
- textflow_ir-0.1.0/src/query/query_parser.cpp +70 -0
- textflow_ir-0.1.0/src/query/rewrite_internal.hpp +50 -0
- textflow_ir-0.1.0/src/query/rewrite_loader.cpp +215 -0
- textflow_ir-0.1.0/src/query/rewrite_validator.cpp +292 -0
- textflow_ir-0.1.0/src/util/porter_stemmer.cpp +531 -0
- textflow_ir-0.1.0/src/util/porter_stemmer.hpp +10 -0
- textflow_ir-0.1.0/src/util/unicode.cpp +190 -0
- textflow_ir-0.1.0/src/util/unicode.hpp +17 -0
- textflow_ir-0.1.0/tests/english_analyzer_tests.cpp +241 -0
- textflow_ir-0.1.0/tests/filter_tests.cpp +109 -0
- textflow_ir-0.1.0/tests/python_smoke_test.py +98 -0
- textflow_ir-0.1.0/tests/query_tests.cpp +336 -0
- textflow_ir-0.1.0/tests/standard_tokenizer_tests.cpp +82 -0
- textflow_ir-0.1.0/tests/term_extractor_tests.cpp +42 -0
- textflow_ir-0.1.0/tests/test_framework.cpp +22 -0
- textflow_ir-0.1.0/tests/test_framework.hpp +80 -0
- textflow_ir-0.1.0/tests/test_helpers.hpp +38 -0
- textflow_ir-0.1.0/tests/test_main.cpp +23 -0
- textflow_ir-0.1.0/tools/build_python_dist.sh +5 -0
- textflow_ir-0.1.0/tools/publish_python_dist.sh +9 -0
- textflow_ir-0.1.0/tools/test_full_build.sh +31 -0
- textflow_ir-0.1.0/tools/test_in_docker.sh +4 -0
- textflow_ir-0.1.0/tools/test_installed_consumer.sh +47 -0
- textflow_ir-0.1.0/tools/test_python_wheel_install.sh +34 -0
|
@@ -0,0 +1,171 @@
|
|
|
# termflow — English text analysis library for information retrieval.
# Top-level configuration: policies, feature options, language standard,
# and required third-party dependencies.

# Version range form: 3.24 is the hard minimum; policies up to 3.28 are
# enabled automatically when a newer CMake runs the configure.
cmake_minimum_required(VERSION 3.24...3.28)

project(
    termflow
    VERSION 0.1.0
    DESCRIPTION "English text analysis library for C++"
    LANGUAGES CXX)

include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

# Build-tree feature toggles (all project-prefixed boolean cache options).
option(TERMFLOW_BUILD_TESTS "Build the termflow test suite" ON)
option(TERMFLOW_BUILD_EXAMPLES "Build the termflow example programs" ON)
option(TERMFLOW_BUILD_TOOLS "Build the termflow command-line tools" ON)
option(TERMFLOW_BUILD_PYTHON "Build the termflow Python bindings" OFF)
option(TERMFLOW_INSTALL_CPP_ARTIFACTS "Install C++ library and development artifacts" ON)

# Project-wide C++ standard defaults. Targets additionally declare
# cxx_std_20 via target_compile_features so the requirement propagates
# to consumers.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# ICU provides the Unicode machinery used by the tokenizer and the
# normalization filters (see src/util/unicode.cpp and
# src/analysis/standard_tokenizer.cpp in the source list below).
find_package(ICU REQUIRED COMPONENTS i18n uc)

if(TERMFLOW_BUILD_PYTHON)
    # Development.Module is the minimal component needed to build an
    # extension module; the full Development component is not required.
    find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module)
    find_package(pybind11 CONFIG REQUIRED)
endif()
+
# Core library target. Sources are listed explicitly (no globbing) so
# file additions show up in diffs and incremental builds stay correct.
add_library(termflow
    src/analysis/analyzer.cpp
    src/analysis/ascii_folding_filter.cpp
    src/analysis/english_analyzer.cpp
    src/analysis/english_possessive_filter.cpp
    src/analysis/lower_case_filter.cpp
    src/analysis/porter_stem_filter.cpp
    src/analysis/standard_tokenizer.cpp
    src/analysis/stop_filter.cpp
    src/analysis/term_extractor.cpp
    src/analysis/unicode_normalize_filter.cpp
    src/query/query_analyzer.cpp
    src/query/query_parser.cpp
    src/query/rewrite_loader.cpp
    src/query/rewrite_validator.cpp
    src/util/porter_stemmer.cpp
    src/util/unicode.cpp)

# Namespaced alias so in-tree consumers link the same spelling as
# installed consumers (termflow::termflow), and typos fail at configure
# time instead of at link time.
add_library(termflow::termflow ALIAS termflow)

target_compile_features(termflow PUBLIC cxx_std_20)

# ICU is linked PUBLIC as in the original build.
# NOTE(review): confirm ICU types actually appear in the public headers;
# if they are implementation details only, this could be narrowed to
# PRIVATE to shrink the consumer interface.
target_link_libraries(termflow PUBLIC ICU::i18n ICU::uc)

target_include_directories(termflow
    PUBLIC
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
    PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/src)

# Warning flags are selected per compiler front end with generator
# expressions. The previous if(MSVC)/else() split passed GCC-style flags
# to every non-MSVC compiler (including ones that may reject them) and
# treated clang-cl as plain MSVC. Matching compiler IDs explicitly keeps
# the flags on compilers known to accept them.
target_compile_options(termflow PRIVATE
    $<$<CXX_COMPILER_ID:MSVC>:/W4;/permissive->
    $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wall;-Wextra;-Wpedantic>)

set_target_properties(termflow PROPERTIES
    EXPORT_NAME termflow
    POSITION_INDEPENDENT_CODE ON          # shared-lib and Python-module safe
    VERSION ${PROJECT_VERSION}
    SOVERSION 0)                          # pre-1.0 ABI series
+
if(TERMFLOW_BUILD_EXAMPLES)
    # Declare one example executable linked against the core library.
    # Arguments: target name, then the example source file.
    function(termflow_add_example example_target example_source)
        add_executable(${example_target} ${example_source})
        target_link_libraries(${example_target} PRIVATE termflow::termflow)
    endfunction()

    termflow_add_example(termflow_analyze examples/analyze_text.cpp)
    termflow_add_example(termflow_extract_terms examples/extract_terms.cpp)
    termflow_add_example(termflow_custom_analyzer examples/custom_analyzer.cpp)
    termflow_add_example(termflow_analyze_query examples/analyze_query.cpp)
endif()
+
if(TERMFLOW_BUILD_TOOLS)
    # NOTE(review): no command-line tools are defined yet — this block is
    # intentionally empty. It keeps a stable home for the
    # TERMFLOW_BUILD_TOOLS option so the first tool can be added here
    # without reshaping the file.
endif()
+
if(TERMFLOW_BUILD_PYTHON)
    # Stage the pure-Python package shim next to the compiled extension
    # in the build tree, so the package is importable directly with
    # PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/python.
    set(TERMFLOW_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/python/termflow)
    file(MAKE_DIRECTORY ${TERMFLOW_PYTHON_PACKAGE_DIR})
    configure_file(
        ${CMAKE_CURRENT_SOURCE_DIR}/python/termflow/__init__.py
        ${TERMFLOW_PYTHON_PACKAGE_DIR}/__init__.py
        COPYONLY)

    # The extension is named _termflow and lives inside the termflow
    # package directory; __init__.py re-exports from it.
    pybind11_add_module(termflow_python MODULE bindings/python/module.cpp)
    target_link_libraries(termflow_python PRIVATE termflow::termflow)
    set_target_properties(termflow_python PROPERTIES
        OUTPUT_NAME _termflow
        LIBRARY_OUTPUT_DIRECTORY ${TERMFLOW_PYTHON_PACKAGE_DIR}
        RUNTIME_OUTPUT_DIRECTORY ${TERMFLOW_PYTHON_PACKAGE_DIR})

    # Install layout mirrors the build tree: both the extension and
    # __init__.py land in a "termflow" directory relative to the install
    # prefix.
    install(
        TARGETS termflow_python
        LIBRARY DESTINATION termflow
        RUNTIME DESTINATION termflow)

    install(
        FILES python/termflow/__init__.py
        DESTINATION termflow)
endif()
if(TERMFLOW_BUILD_TESTS)
    enable_testing()

    add_executable(termflow_tests)
    target_sources(termflow_tests PRIVATE
        tests/english_analyzer_tests.cpp
        tests/filter_tests.cpp
        tests/query_tests.cpp
        tests/standard_tokenizer_tests.cpp
        tests/term_extractor_tests.cpp
        tests/test_framework.cpp
        tests/test_main.cpp
        tests/test_helpers.hpp)   # header listed so IDE generators show it
    target_link_libraries(termflow_tests PRIVATE termflow::termflow)

    # Using the target name in COMMAND lets CTest expand it to the built
    # binary path on every generator.
    add_test(NAME termflow_tests COMMAND termflow_tests)

    if(TERMFLOW_BUILD_PYTHON)
        # Smoke test imports the package staged in the build tree by the
        # Python bindings block above.
        add_test(
            NAME termflow_python_smoke
            COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tests/python_smoke_test.py)
        set_tests_properties(
            termflow_python_smoke
            PROPERTIES ENVIRONMENT "PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/python")
    endif()
endif()
+
if(TERMFLOW_INSTALL_CPP_ARTIFACTS)
    # Shared destination for the exported targets file and the package
    # config files, so find_package(termflow) can locate everything.
    set(termflow_cmake_install_dir ${CMAKE_INSTALL_LIBDIR}/cmake/termflow)

    install(
        TARGETS termflow
        EXPORT termflowTargets
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})

    # Public headers install verbatim from include/ (trailing slash copies
    # the directory contents, not the directory itself).
    install(
        DIRECTORY include/
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

    install(
        EXPORT termflowTargets
        FILE termflowTargets.cmake
        NAMESPACE termflow::
        DESTINATION ${termflow_cmake_install_dir})

    # SameMajorVersion: any 0.x consumer request matches this 0.1.0 build.
    write_basic_package_version_file(
        ${CMAKE_CURRENT_BINARY_DIR}/termflowConfigVersion.cmake
        VERSION ${PROJECT_VERSION}
        COMPATIBILITY SameMajorVersion)

    configure_package_config_file(
        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/termflowConfig.cmake.in
        ${CMAKE_CURRENT_BINARY_DIR}/termflowConfig.cmake
        INSTALL_DESTINATION ${termflow_cmake_install_dir})

    install(
        FILES
            ${CMAKE_CURRENT_BINARY_DIR}/termflowConfig.cmake
            ${CMAKE_CURRENT_BINARY_DIR}/termflowConfigVersion.cmake
        DESTINATION ${termflow_cmake_install_dir})
endif()
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: textflow-ir
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: English text analysis for information retrieval
|
|
5
|
+
Keywords: text-analysis,information-retrieval,tokenization,stemming,search
|
|
6
|
+
Author: Mustafa Abualsaud
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: C++
|
|
15
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# termflow
|
|
20
|
+
|
|
21
|
+
`termflow` is a standalone C++20 library for English text analysis aimed at information retrieval workloads.
|
|
22
|
+
|
|
23
|
+
## Documentation
|
|
24
|
+
|
|
25
|
+
- [docs/usage.md](docs/usage.md) for day-to-day analyzer, term extraction, query, and Python usage
|
|
26
|
+
- [docs/customization.md](docs/customization.md) for pipeline tuning, query rewrites, and custom analyzers in C++
|
|
27
|
+
- [docs/installation.md](docs/installation.md) for current C++ and Python installation paths
|
|
28
|
+
- [docs/installation-roadmap.md](docs/installation-roadmap.md) for the prioritized installation improvement plan
|
|
29
|
+
|
|
30
|
+
## Goals
|
|
31
|
+
|
|
32
|
+
- Library-first design with a small public API
|
|
33
|
+
- CMake-based build and install flow
|
|
34
|
+
- Clean, testable analysis components
|
|
35
|
+
- English-first analysis for v1
|
|
36
|
+
- Optional Python bindings on top of the same core library
|
|
37
|
+
|
|
38
|
+
## Current Scope
|
|
39
|
+
|
|
40
|
+
`termflow` currently includes:
|
|
41
|
+
|
|
42
|
+
- `Token`
|
|
43
|
+
- `CharFilter`
|
|
44
|
+
- `Tokenizer`
|
|
45
|
+
- `TokenFilter`
|
|
46
|
+
- `Analyzer`
|
|
47
|
+
- `TermExtractor`
|
|
48
|
+
- `termflow::query::QueryParser`
|
|
49
|
+
- `termflow::query::RewriteLoader`
|
|
50
|
+
- `termflow::query::RewriteValidator`
|
|
51
|
+
- `termflow::query::QueryAnalyzer`
|
|
52
|
+
- `StandardTokenizer`
|
|
53
|
+
- `UnicodeNormalizeFilter`
|
|
54
|
+
- `LowerCaseFilter`
|
|
55
|
+
- `EnglishPossessiveFilter`
|
|
56
|
+
- `StopFilter`
|
|
57
|
+
- `PorterStemFilter`
|
|
58
|
+
- `AsciiFoldingFilter`
|
|
59
|
+
- `EnglishAnalyzer`
|
|
60
|
+
|
|
61
|
+
Core analyzer APIs:
|
|
62
|
+
|
|
63
|
+
- `analyze(text) -> std::vector<Token>`
|
|
64
|
+
- `analyze_terms(text) -> std::vector<std::string>`
|
|
65
|
+
- `analyze_term(text) -> std::optional<std::string>`
|
|
66
|
+
- `analyze_to_string(text, separator = " ") -> std::string`
|
|
67
|
+
- `normalize(text) -> std::string`
|
|
68
|
+
|
|
69
|
+
`analyze()` runs the full token pipeline. `normalize()` is deliberately lighter-weight and does not tokenize.
|
|
70
|
+
|
|
71
|
+
The query module is intentionally text-focused. It can parse lightweight query intent, analyze clauses through an existing analyzer, and attach optional query-time rewrites. It does not score documents or execute retrieval.
|
|
72
|
+
|
|
73
|
+
## Behavior Contract
|
|
74
|
+
|
|
75
|
+
For `EnglishAnalyzer`, `analyze()` applies stages in this order:
|
|
76
|
+
|
|
77
|
+
1. `StandardTokenizer`
|
|
78
|
+
2. optional Unicode normalization
|
|
79
|
+
3. optional English possessive stripping
|
|
80
|
+
4. optional lowercasing
|
|
81
|
+
5. optional stop-word filtering
|
|
82
|
+
6. keyword marking from `stem_exclusion_set`
|
|
83
|
+
7. optional Porter stemming
|
|
84
|
+
8. optional ASCII folding
|
|
85
|
+
|
|
86
|
+
`normalize()` is intentionally narrower. It honors only:
|
|
87
|
+
|
|
88
|
+
- `unicode_normalization`
|
|
89
|
+
- `lowercase`
|
|
90
|
+
- `ascii_folding`
|
|
91
|
+
|
|
92
|
+
It does not tokenize, strip possessives, remove stop words, mark keywords, or stem.
|
|
93
|
+
|
|
94
|
+
Stage toggles are explicit. Disabling a stage removes only that stage. For example, if `lowercase=false` and `stemming=true`, stemming runs on the token text produced by the earlier enabled stages without forcing lowercase.
|
|
95
|
+
|
|
96
|
+
Custom `stop_words` and `stem_exclusion_set` are normalized using the enabled pre-stem stages so they match the token text seen by stop-word filtering and keyword marking.
|
|
97
|
+
|
|
98
|
+
## Term Extraction
|
|
99
|
+
|
|
100
|
+
`TermExtractor` is the public helper for consumers that want finalized term strings with explicit post-analysis filtering. It can enforce minimum length, numeric-term handling, and character policy filters without changing the analyzer itself.
|
|
101
|
+
|
|
102
|
+
Example:
|
|
103
|
+
|
|
104
|
+
```cpp
|
|
105
|
+
termflow::EnglishAnalyzerOptions analyzer_options;
|
|
106
|
+
analyzer_options.use_default_stop_words = false;
|
|
107
|
+
termflow::EnglishAnalyzer analyzer(analyzer_options);
|
|
108
|
+
|
|
109
|
+
termflow::TermExtractionOptions extraction_options;
|
|
110
|
+
extraction_options.min_term_length = 2;
|
|
111
|
+
extraction_options.keep_numeric_terms = false;
|
|
112
|
+
extraction_options.character_policy = termflow::TermCharacterPolicy::ascii_alphabetic;
|
|
113
|
+
|
|
114
|
+
termflow::TermExtractor extractor(analyzer, extraction_options);
|
|
115
|
+
auto terms = extractor.extract("Running beaches 2024 e-mail");
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
A runnable example is included at [examples/extract_terms.cpp](examples/extract_terms.cpp).
|
|
119
|
+
|
|
120
|
+
## Query Analysis
|
|
121
|
+
|
|
122
|
+
`termflow::query` provides a small query-intent layer on top of existing analyzers. The current scope is:
|
|
123
|
+
|
|
124
|
+
- clause parsing for bare terms, `+required`, `-excluded`, and quoted groups
|
|
125
|
+
- analyzed query terms produced by an existing `Analyzer`
|
|
126
|
+
- optional query-time rewrites for canonicalization, equivalent alternatives, and expansions
|
|
127
|
+
- analyzer-aware validation for rewrite packs
|
|
128
|
+
|
|
129
|
+
This is intentionally not a search engine query model. The query module does not score, rank, filter, or execute queries against an index.
|
|
130
|
+
|
|
131
|
+
Example:
|
|
132
|
+
|
|
133
|
+
```cpp
|
|
134
|
+
#include "termflow/analysis/english_analyzer.hpp"
|
|
135
|
+
#include "termflow/query/query_analyzer.hpp"
|
|
136
|
+
#include "termflow/query/rewrite_loader.hpp"
|
|
137
|
+
|
|
138
|
+
termflow::EnglishAnalyzer analyzer;
|
|
139
|
+
termflow::query::RewriteLoader loader;
|
|
140
|
+
termflow::query::RewriteValidator validator(analyzer);
|
|
141
|
+
|
|
142
|
+
termflow::query::QueryAnalysisOptions options;
|
|
143
|
+
options.rewrites = loader.load_file("config/query-rewrites.tfq");
|
|
144
|
+
const auto validation = validator.validate(options);
|
|
145
|
+
|
|
146
|
+
termflow::query::QueryAnalyzer query_analyzer(analyzer, options);
|
|
147
|
+
const auto query = query_analyzer.analyze(R"(tv +"car rentals" -expired)");
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
In that example:
|
|
151
|
+
|
|
152
|
+
- `tv` is canonically rewritten before it reaches downstream scoring logic
|
|
153
|
+
- `car rentals` keeps its primary analyzed terms and also receives an equivalent alternative
|
|
154
|
+
- `-expired` is parsed as an excluded clause
|
|
155
|
+
|
|
156
|
+
Rewrites match exact analyzed term sequences at the clause level. Quoted groups remain grouped in the parsed and analyzed query representation, but `termflow` does not implement phrase execution logic.
|
|
157
|
+
|
|
158
|
+
`RewriteValidator` reports rewrite rules that analyze to no terms, canonical conflicts and cycles, and alternatives that are duplicated or shadowed by the configured deduplication policy. It is a reporting layer only; it does not rewrite or reorder the supplied rules.
|
|
159
|
+
|
|
160
|
+
Rewrite files use a small declarative syntax:
|
|
161
|
+
|
|
162
|
+
```text
|
|
163
|
+
# Canonical rewrites
|
|
164
|
+
lotr -> lord of the rings;
|
|
165
|
+
colour -> color;
|
|
166
|
+
|
|
167
|
+
# Equivalent alternatives
|
|
168
|
+
car rentals => automobile rentals;
|
|
169
|
+
|
|
170
|
+
# Expansion alternatives
|
|
171
|
+
car rentals ~> vehicle rentals;
|
|
172
|
+
|
|
173
|
+
# Includes are resolved relative to the current file
|
|
174
|
+
include "common-rewrites.tfq";
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Supported forms are intentionally limited to text rewrites and includes. `termflow` does not interpret field syntax, conditional macros, stop-word directives, boosts, or other higher-level query language constructs in rewrite files.
|
|
178
|
+
|
|
179
|
+
A runnable example is included at [examples/analyze_query.cpp](examples/analyze_query.cpp).
|
|
180
|
+
|
|
181
|
+
## API Contracts
|
|
182
|
+
|
|
183
|
+
- Offsets are represented as UTF-16 code-unit offsets.
|
|
184
|
+
- `Analyzer`, `Tokenizer`, and `TermExtractor` are batch-oriented APIs that materialize results into `std::vector` containers.
|
|
185
|
+
- `QueryParser`, `RewriteLoader`, `RewriteValidator`, and `QueryAnalyzer` are also batch-oriented APIs that materialize parsed clauses, rewrite rules, validation reports, and analyzed term sequences into `std::vector` containers.
|
|
186
|
+
- v1 does not expose a streaming token API.
|
|
187
|
+
- Implementations may throw `std::runtime_error` when ICU cannot be initialized or fails while processing text.
|
|
188
|
+
- Invalid configuration is reported with standard exceptions such as `std::invalid_argument`.
|
|
189
|
+
- Constructed analyzers and filters are safe for concurrent const use. A typical parallel usage pattern is to share one configured analyzer across worker threads and analyze different documents concurrently.
|
|
190
|
+
- `QueryParser`, `RewriteLoader`, `RewriteValidator`, and `QueryAnalyzer` are also safe for concurrent const use after construction.
|
|
191
|
+
- Do not reconfigure mutable objects, such as a tokenizer with a new max token length, concurrently with analysis.
|
|
192
|
+
|
|
193
|
+
## Design Notes
|
|
194
|
+
|
|
195
|
+
- Tokenization uses ICU word boundaries.
|
|
196
|
+
- ASCII folding is optional and disabled by default.
|
|
197
|
+
- The Porter stemmer is embedded and scoped for English terms.
|
|
198
|
+
- The built-in English analyzer uses an optimized internal fast path for its pre-stem normalization stages. Tests compare that path against an explicitly composed public-filter pipeline to keep them aligned.
|
|
199
|
+
|
|
200
|
+
## Custom Analyzers
|
|
201
|
+
|
|
202
|
+
The public tokenizer and filter interfaces are intended to support explicit analyzer construction. A runnable example is included at [examples/custom_analyzer.cpp](examples/custom_analyzer.cpp).
|
|
203
|
+
|
|
204
|
+
That example shows how to:
|
|
205
|
+
|
|
206
|
+
- subclass `Analyzer`
|
|
207
|
+
- reuse `StandardTokenizer`
|
|
208
|
+
- compose public token filters directly
|
|
209
|
+
- define a separate `normalize()` path for the custom analyzer
|
|
210
|
+
|
|
211
|
+
The query module is designed to compose with custom analyzers as well. `QueryAnalyzer` accepts any configured `Analyzer` and uses that analyzer's `analyze_terms()` behavior when normalizing query clauses and rewrite rules.
|
|
212
|
+
|
|
213
|
+
## Limitations
|
|
214
|
+
|
|
215
|
+
This is intentionally a small v1.
|
|
216
|
+
|
|
217
|
+
- English only
|
|
218
|
+
- No token graphs
|
|
219
|
+
- No token-graph-style synonym processing
|
|
220
|
+
- No phrase execution logic
|
|
221
|
+
- Query parsing is intentionally lightweight: no escaping, precedence rules, boosts, or field syntax
|
|
222
|
+
- No index structures
|
|
223
|
+
- No multilingual analyzers
|
|
224
|
+
- No concrete `CharFilter` implementations yet
|
|
225
|
+
|
|
226
|
+
## Future Direction
|
|
227
|
+
|
|
228
|
+
The core abstractions are language-agnostic. The intended direction is:
|
|
229
|
+
|
|
230
|
+
- keep `termflow` core focused on shared analysis primitives
|
|
231
|
+
- keep query-intent analysis focused on parsing and rewrite preparation rather than scoring
|
|
232
|
+
- add language analyzers as independent modules or packages
|
|
233
|
+
- keep analyzer construction explicit rather than reflection-driven
|
|
234
|
+
- reuse shared Unicode handling where it improves correctness
|
|
235
|
+
|
|
236
|
+
## Building
|
|
237
|
+
|
|
238
|
+
Local build:
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
cmake -S . -B build -G Ninja
|
|
242
|
+
cmake --build build
|
|
243
|
+
ctest --test-dir build --output-on-failure
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
The examples are also built by default:
|
|
247
|
+
|
|
248
|
+
- `termflow_analyze`
|
|
249
|
+
- `termflow_extract_terms`
|
|
250
|
+
- `termflow_custom_analyzer`
|
|
251
|
+
- `termflow_analyze_query`
|
|
252
|
+
|
|
253
|
+
## Python Bindings
|
|
254
|
+
|
|
255
|
+
Optional Python bindings can be built on top of the same core library:
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
cmake -S . -B build -G Ninja -DTERMFLOW_BUILD_PYTHON=ON
|
|
259
|
+
cmake --build build
|
|
260
|
+
PYTHONPATH=build/python python3 -c 'import termflow; print(termflow.EnglishAnalyzer().analyze_terms("Running Cars"))'
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
Python wheel and sdist build:
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
python3 -m build --sdist --wheel
|
|
267
|
+
python3 -m twine check dist/*
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
The repository includes [pyproject.toml](pyproject.toml) and two helper scripts:
|
|
271
|
+
|
|
272
|
+
- [tools/build_python_dist.sh](tools/build_python_dist.sh)
|
|
273
|
+
- [tools/publish_python_dist.sh](tools/publish_python_dist.sh)
|
|
274
|
+
|
|
275
|
+
The optional Python bindings expose the same query module under `termflow.query`.
|
|
276
|
+
|
|
277
|
+
## Container Build
|
|
278
|
+
|
|
279
|
+
The repository also includes a containerized dev/test environment for reproducible builds:
|
|
280
|
+
|
|
281
|
+
```bash
|
|
282
|
+
docker compose run --rm dev bash -lc './tools/test_in_docker.sh'
|
|
283
|
+
```
|