termflow-ir 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- termflow_ir-0.1.2/.gitignore +11 -0
- termflow_ir-0.1.2/CMakeLists.txt +178 -0
- termflow_ir-0.1.2/PKG-INFO +174 -0
- termflow_ir-0.1.2/README.md +151 -0
- termflow_ir-0.1.2/bindings/python/module.cpp +296 -0
- termflow_ir-0.1.2/cmake/termflowConfig.cmake.in +6 -0
- termflow_ir-0.1.2/docker/Dockerfile +30 -0
- termflow_ir-0.1.2/docker-compose.yml +9 -0
- termflow_ir-0.1.2/docs/customization.md +228 -0
- termflow_ir-0.1.2/docs/installation-roadmap.md +118 -0
- termflow_ir-0.1.2/docs/installation.md +194 -0
- termflow_ir-0.1.2/docs/usage.md +237 -0
- termflow_ir-0.1.2/examples/analyze_query.cpp +94 -0
- termflow_ir-0.1.2/examples/analyze_text.cpp +39 -0
- termflow_ir-0.1.2/examples/custom_analyzer.cpp +66 -0
- termflow_ir-0.1.2/examples/extract_terms.cpp +44 -0
- termflow_ir-0.1.2/examples/find_package_consumer/CMakeLists.txt +9 -0
- termflow_ir-0.1.2/examples/find_package_consumer/main.cpp +15 -0
- termflow_ir-0.1.2/include/termflow/analysis/analyzer.hpp +53 -0
- termflow_ir-0.1.2/include/termflow/analysis/ascii_folding_filter.hpp +12 -0
- termflow_ir-0.1.2/include/termflow/analysis/char_filter.hpp +21 -0
- termflow_ir-0.1.2/include/termflow/analysis/english_analyzer.hpp +90 -0
- termflow_ir-0.1.2/include/termflow/analysis/english_possessive_filter.hpp +12 -0
- termflow_ir-0.1.2/include/termflow/analysis/lower_case_filter.hpp +12 -0
- termflow_ir-0.1.2/include/termflow/analysis/porter_stem_filter.hpp +12 -0
- termflow_ir-0.1.2/include/termflow/analysis/standard_tokenizer.hpp +34 -0
- termflow_ir-0.1.2/include/termflow/analysis/stop_filter.hpp +22 -0
- termflow_ir-0.1.2/include/termflow/analysis/term_extractor.hpp +55 -0
- termflow_ir-0.1.2/include/termflow/analysis/token.hpp +27 -0
- termflow_ir-0.1.2/include/termflow/analysis/token_filter.hpp +22 -0
- termflow_ir-0.1.2/include/termflow/analysis/tokenizer.hpp +24 -0
- termflow_ir-0.1.2/include/termflow/analysis/unicode_normalize_filter.hpp +12 -0
- termflow_ir-0.1.2/include/termflow/query/query_analyzer.hpp +53 -0
- termflow_ir-0.1.2/include/termflow/query/query_parser.hpp +30 -0
- termflow_ir-0.1.2/include/termflow/query/query_types.hpp +96 -0
- termflow_ir-0.1.2/include/termflow/query/rewrite_loader.hpp +33 -0
- termflow_ir-0.1.2/include/termflow/query/rewrite_validator.hpp +78 -0
- termflow_ir-0.1.2/pyproject.toml +89 -0
- termflow_ir-0.1.2/python/termflow/__init__.py +32 -0
- termflow_ir-0.1.2/python/termflow/cli.py +258 -0
- termflow_ir-0.1.2/src/analysis/analyzer.cpp +71 -0
- termflow_ir-0.1.2/src/analysis/ascii_folding_filter.cpp +13 -0
- termflow_ir-0.1.2/src/analysis/english_analyzer.cpp +105 -0
- termflow_ir-0.1.2/src/analysis/english_possessive_filter.cpp +13 -0
- termflow_ir-0.1.2/src/analysis/lower_case_filter.cpp +13 -0
- termflow_ir-0.1.2/src/analysis/porter_stem_filter.cpp +15 -0
- termflow_ir-0.1.2/src/analysis/standard_tokenizer.cpp +199 -0
- termflow_ir-0.1.2/src/analysis/stop_filter.cpp +39 -0
- termflow_ir-0.1.2/src/analysis/term_extractor.cpp +149 -0
- termflow_ir-0.1.2/src/analysis/unicode_normalize_filter.cpp +13 -0
- termflow_ir-0.1.2/src/query/query_analyzer.cpp +105 -0
- termflow_ir-0.1.2/src/query/query_parser.cpp +70 -0
- termflow_ir-0.1.2/src/query/rewrite_internal.hpp +50 -0
- termflow_ir-0.1.2/src/query/rewrite_loader.cpp +215 -0
- termflow_ir-0.1.2/src/query/rewrite_validator.cpp +292 -0
- termflow_ir-0.1.2/src/util/porter_stemmer.cpp +531 -0
- termflow_ir-0.1.2/src/util/porter_stemmer.hpp +10 -0
- termflow_ir-0.1.2/src/util/unicode.cpp +190 -0
- termflow_ir-0.1.2/src/util/unicode.hpp +17 -0
- termflow_ir-0.1.2/tests/english_analyzer_tests.cpp +241 -0
- termflow_ir-0.1.2/tests/filter_tests.cpp +109 -0
- termflow_ir-0.1.2/tests/python_smoke_test.py +147 -0
- termflow_ir-0.1.2/tests/query_tests.cpp +336 -0
- termflow_ir-0.1.2/tests/standard_tokenizer_tests.cpp +82 -0
- termflow_ir-0.1.2/tests/term_extractor_tests.cpp +42 -0
- termflow_ir-0.1.2/tests/test_framework.cpp +22 -0
- termflow_ir-0.1.2/tests/test_framework.hpp +80 -0
- termflow_ir-0.1.2/tests/test_helpers.hpp +38 -0
- termflow_ir-0.1.2/tests/test_main.cpp +23 -0
- termflow_ir-0.1.2/tools/build_python_dist.sh +5 -0
- termflow_ir-0.1.2/tools/publish_python_dist.sh +9 -0
- termflow_ir-0.1.2/tools/test_full_build.sh +31 -0
- termflow_ir-0.1.2/tools/test_in_docker.sh +4 -0
- termflow_ir-0.1.2/tools/test_installed_consumer.sh +47 -0
- termflow_ir-0.1.2/tools/test_python_wheel_install.sh +35 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# Project preamble: minimum CMake version, project identity, build switches,
# language standard defaults, and third-party dependencies.
#
# CMake 3.24 is the floor. It also guarantees PROJECT_IS_TOP_LEVEL (3.21+),
# which the option defaults below rely on.
cmake_minimum_required(VERSION 3.24)

project(
  termflow
  VERSION 0.1.2
  DESCRIPTION "English text analysis library for C++"
  LANGUAGES CXX)

# GNUInstallDirs supplies the CMAKE_INSTALL_<dir> variables used by the install
# rules; CMakePackageConfigHelpers supplies the package config/version helpers.
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

# Build switches.
#
# Tests and examples default to ON only when termflow is the top-level project.
# When it is pulled into another build (add_subdirectory / FetchContent) they
# default to OFF so the parent build does not compile termflow's test and
# example executables or inherit its test registrations. Either switch can
# still be set explicitly with -D<option>=ON.
option(TERMFLOW_BUILD_TESTS "Build the termflow test suite" ${PROJECT_IS_TOP_LEVEL})
option(TERMFLOW_BUILD_EXAMPLES "Build the termflow example programs" ${PROJECT_IS_TOP_LEVEL})
option(TERMFLOW_BUILD_TOOLS "Build the termflow command-line tools" ON)
option(TERMFLOW_BUILD_PYTHON "Build the termflow Python bindings" OFF)
option(TERMFLOW_INSTALL_CPP_ARTIFACTS "Install C++ library and development artifacts" ON)

# Project-wide default: strict C++20 with compiler extensions disabled.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# ICU is a hard dependency; the library target links ICU::i18n and ICU::uc.
find_package(ICU REQUIRED COMPONENTS i18n uc)

# The Python toolchain is only looked up when the bindings are requested.
if(TERMFLOW_BUILD_PYTHON)
  find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module)
  find_package(pybind11 CONFIG REQUIRED)
endif()
|
|
28
|
+
|
|
29
|
+
# Core termflow library. No library type is given, so static vs. shared follows
# BUILD_SHARED_LIBS. Consumers inside and outside the build tree link the
# namespaced alias termflow::termflow.
add_library(termflow)
add_library(termflow::termflow ALIAS termflow)

target_sources(termflow
  PRIVATE
    # Analysis pipeline
    src/analysis/analyzer.cpp
    src/analysis/ascii_folding_filter.cpp
    src/analysis/english_analyzer.cpp
    src/analysis/english_possessive_filter.cpp
    src/analysis/lower_case_filter.cpp
    src/analysis/porter_stem_filter.cpp
    src/analysis/standard_tokenizer.cpp
    src/analysis/stop_filter.cpp
    src/analysis/term_extractor.cpp
    src/analysis/unicode_normalize_filter.cpp
    # Query parsing and rewriting
    src/query/query_analyzer.cpp
    src/query/query_parser.cpp
    src/query/rewrite_loader.cpp
    src/query/rewrite_validator.cpp
    # Shared utilities
    src/util/porter_stemmer.cpp
    src/util/unicode.cpp)

# Usage requirements: C++20 and ICU propagate to consumers (PUBLIC).
target_compile_features(termflow PUBLIC cxx_std_20)
target_link_libraries(termflow PUBLIC ICU::i18n ICU::uc)

# Public headers come from include/ in the build tree and from the install
# include directory once installed; src/ is visible only while building the
# library itself.
target_include_directories(termflow
  PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src)

# Warning flags apply to the library's own sources only (PRIVATE).
if(NOT MSVC)
  target_compile_options(termflow PRIVATE -Wall -Wextra -Wpedantic)
else()
  target_compile_options(termflow PRIVATE /W4 /permissive-)
endif()

set_target_properties(termflow PROPERTIES
  POSITION_INDEPENDENT_CODE ON
  EXPORT_NAME termflow
  VERSION ${PROJECT_VERSION}
  SOVERSION 0)
|
|
69
|
+
|
|
70
|
+
if(TERMFLOW_BUILD_EXAMPLES)
  # Declares one example executable built from a single source file and links
  # it against the library alias.
  #   target - name of the executable target to create
  #   source - source file path, relative to this directory
  function(termflow_add_example target source)
    add_executable(${target} ${source})
    target_link_libraries(${target} PRIVATE termflow::termflow)
  endfunction()

  termflow_add_example(termflow_analyze examples/analyze_text.cpp)
  termflow_add_example(termflow_extract_terms examples/extract_terms.cpp)
  termflow_add_example(termflow_custom_analyzer examples/custom_analyzer.cpp)
  termflow_add_example(termflow_analyze_query examples/analyze_query.cpp)
endif()

# TERMFLOW_BUILD_TOOLS currently has no effect: this branch declares no targets.
if(TERMFLOW_BUILD_TOOLS)
endif()
|
|
86
|
+
|
|
87
|
+
# Python bindings: stages an importable `termflow` package in the build tree
# (pure-Python files plus the compiled `_termflow` extension) and installs the
# same contents under the `termflow` destination.
if(TERMFLOW_BUILD_PYTHON)
  set(TERMFLOW_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/python/termflow)

  # Single source of truth for the pure-Python package files. Both the
  # build-tree copy and the install rule below are derived from this list, so a
  # file added here cannot be staged for tests yet missing from the install.
  set(TERMFLOW_PYTHON_PACKAGE_FILES
    __init__.py
    cli.py)

  # Copy the package files next to where the extension module will be written.
  file(MAKE_DIRECTORY ${TERMFLOW_PYTHON_PACKAGE_DIR})
  foreach(package_file IN LISTS TERMFLOW_PYTHON_PACKAGE_FILES)
    configure_file(
      ${CMAKE_CURRENT_SOURCE_DIR}/python/termflow/${package_file}
      ${TERMFLOW_PYTHON_PACKAGE_DIR}/${package_file}
      COPYONLY)
  endforeach()

  # Extension module, emitted as `_termflow` inside the staged package.
  pybind11_add_module(termflow_python MODULE bindings/python/module.cpp)
  target_link_libraries(termflow_python PRIVATE termflow::termflow)
  set_target_properties(termflow_python PROPERTIES
    OUTPUT_NAME _termflow
    LIBRARY_OUTPUT_DIRECTORY ${TERMFLOW_PYTHON_PACKAGE_DIR}
    RUNTIME_OUTPUT_DIRECTORY ${TERMFLOW_PYTHON_PACKAGE_DIR})

  install(
    TARGETS termflow_python
    LIBRARY DESTINATION termflow
    RUNTIME DESTINATION termflow)

  # Install the same file list that was staged above, resolved against the
  # source package directory.
  list(TRANSFORM TERMFLOW_PYTHON_PACKAGE_FILES
    PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/python/termflow/"
    OUTPUT_VARIABLE termflow_python_install_files)
  install(
    FILES ${termflow_python_install_files}
    DESTINATION termflow)
endif()
|
|
118
|
+
|
|
119
|
+
# Test suite: one C++ test executable registered with CTest, plus a Python
# smoke test when the bindings are built.
if(TERMFLOW_BUILD_TESTS)
  enable_testing()

  add_executable(termflow_tests)
  target_sources(termflow_tests
    PRIVATE
      tests/english_analyzer_tests.cpp
      tests/filter_tests.cpp
      tests/query_tests.cpp
      tests/standard_tokenizer_tests.cpp
      tests/term_extractor_tests.cpp
      tests/test_framework.cpp
      tests/test_main.cpp
      tests/test_helpers.hpp)
  target_link_libraries(termflow_tests PRIVATE termflow::termflow)

  add_test(NAME termflow_tests COMMAND termflow_tests)

  if(TERMFLOW_BUILD_PYTHON)
    # Runs the smoke script with PYTHONPATH pointing at the build tree's
    # python/ directory.
    add_test(
      NAME termflow_python_smoke
      COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tests/python_smoke_test.py)
    set_property(
      TEST termflow_python_smoke
      PROPERTY ENVIRONMENT "PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/python")
  endif()
endif()
|
|
145
|
+
# Install rules for C++ consumers: the library, its public headers, the
# exported targets file, and the package config/version files.
if(TERMFLOW_INSTALL_CPP_ARTIFACTS)
  # Directory (relative to the install prefix) that receives all of the
  # package's CMake files.
  set(termflow_cmake_install_dir ${CMAKE_INSTALL_LIBDIR}/cmake/termflow)

  # Library binaries, recorded in the termflowTargets export set.
  install(
    TARGETS termflow
    EXPORT termflowTargets
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})

  # Public headers: the contents of include/ are copied into the install
  # include directory.
  install(
    DIRECTORY include/
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

  # Exported targets file; targets are imported under the termflow:: namespace.
  install(
    EXPORT termflowTargets
    FILE termflowTargets.cmake
    NAMESPACE termflow::
    DESTINATION ${termflow_cmake_install_dir})

  # Generate the package config and version files in the build tree.
  configure_package_config_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/termflowConfig.cmake.in
    ${CMAKE_CURRENT_BINARY_DIR}/termflowConfig.cmake
    INSTALL_DESTINATION ${termflow_cmake_install_dir})

  write_basic_package_version_file(
    ${CMAKE_CURRENT_BINARY_DIR}/termflowConfigVersion.cmake
    VERSION ${PROJECT_VERSION}
    COMPATIBILITY SameMajorVersion)

  # Install both generated files alongside the exported targets file.
  install(
    FILES
      ${CMAKE_CURRENT_BINARY_DIR}/termflowConfig.cmake
      ${CMAKE_CURRENT_BINARY_DIR}/termflowConfigVersion.cmake
    DESTINATION ${termflow_cmake_install_dir})
endif()
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: termflow-ir
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: English text analysis for information retrieval
|
|
5
|
+
Keywords: text-analysis,information-retrieval,tokenization,stemming,search
|
|
6
|
+
Author: Mustafa Abualsaud
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: C++
|
|
15
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
16
|
+
Project-URL: Homepage, https://github.com/gathera/termflow
|
|
17
|
+
Project-URL: Documentation, https://github.com/gathera/termflow/blob/main/docs/installation.md
|
|
18
|
+
Project-URL: Repository, https://github.com/gathera/termflow
|
|
19
|
+
Project-URL: Issues, https://github.com/gathera/termflow/issues
|
|
20
|
+
Project-URL: Releases, https://github.com/gathera/termflow/releases
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# termflow
|
|
25
|
+
|
|
26
|
+
English text analysis for information retrieval workloads in C++ and Python.
|
|
27
|
+
|
|
28
|
+
[PyPI version](https://pypi.org/project/termflow-ir/)
|
|
29
|
+
[Python versions](https://pypi.org/project/termflow-ir/)
|
|
30
|
+
[Python package workflow](https://github.com/gathera/termflow/actions/workflows/python-package.yml)
|
|
31
|
+
|
|
32
|
+
`termflow` is a library-first analysis stack for search, indexing, tagging, and query normalization. It provides a built-in English analyzer, term extraction helpers, and a lightweight query rewrite layer without trying to be a full search engine.
|
|
33
|
+
|
|
34
|
+
## Why termflow
|
|
35
|
+
|
|
36
|
+
- C++20 core library with optional Python bindings
|
|
37
|
+
- English analyzer with configurable stemming, stop words, possessive handling, and ASCII folding
|
|
38
|
+
- Term extraction API for finalized search/index terms
|
|
39
|
+
- Query parser and rewrite support for canonicalization, equivalents, and expansions
|
|
40
|
+
- Installable Python wheels for Linux and macOS
|
|
41
|
+
- CMake install flow for downstream C++ consumers
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
|
|
45
|
+
Python package:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install termflow-ir
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Python import:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
import termflow
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
CLI quick check:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
termflow analyze "The Running Cars"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
For C++ installation and `find_package(termflow)` usage, see [docs/installation.md](docs/installation.md).
|
|
64
|
+
|
|
65
|
+
## Quick Start
|
|
66
|
+
|
|
67
|
+
Python:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import termflow
|
|
71
|
+
|
|
72
|
+
analyzer = termflow.EnglishAnalyzer()
|
|
73
|
+
terms = analyzer.analyze_terms("The Running Cars")
|
|
74
|
+
normalized = analyzer.normalize("Running Café")
|
|
75
|
+
|
|
76
|
+
print(terms) # ['run', 'car']
|
|
77
|
+
print(normalized) # 'running café'
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
C++:
|
|
81
|
+
|
|
82
|
+
```cpp
|
|
83
|
+
#include <iostream>
|
|
84
|
+
#include "termflow/analysis/english_analyzer.hpp"
|
|
85
|
+
|
|
86
|
+
int main() {
|
|
87
|
+
termflow::EnglishAnalyzer analyzer;
|
|
88
|
+
const auto terms = analyzer.analyze_terms("The Running Cars");
|
|
89
|
+
|
|
90
|
+
for (const auto& term : terms) {
|
|
91
|
+
std::cout << term << "\n";
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Features
|
|
97
|
+
|
|
98
|
+
| Area | What it includes |
|
|
99
|
+
| --- | --- |
|
|
100
|
+
| Analysis | `EnglishAnalyzer`, token analysis, normalization, stemming, stop words, ASCII folding |
|
|
101
|
+
| Term extraction | `TermExtractor` with length, numeric, and character-policy filtering |
|
|
102
|
+
| Query processing | clause parsing, analyzed query terms, rewrite loading, validation, and alternatives |
|
|
103
|
+
| Python bindings | built-in analyzer, term extractor, and query module under `termflow.query` |
|
|
104
|
+
| CLI | `termflow analyze`, `termflow extract`, and `termflow analyze-query` for quick validation |
|
|
105
|
+
| C++ consumption | installable CMake package and external `find_package` example |
|
|
106
|
+
|
|
107
|
+
## Documentation
|
|
108
|
+
|
|
109
|
+
- [docs/usage.md](docs/usage.md) for day-to-day analyzer, term extraction, query, and Python usage
|
|
110
|
+
- [docs/customization.md](docs/customization.md) for pipeline tuning, query rewrites, and custom analyzers in C++
|
|
111
|
+
- [docs/installation.md](docs/installation.md) for Python and C++ installation paths
|
|
112
|
+
- [docs/installation-roadmap.md](docs/installation-roadmap.md) for packaging and distribution priorities
|
|
113
|
+
|
|
114
|
+
Runnable examples:
|
|
115
|
+
|
|
116
|
+
- [examples/analyze_text.cpp](examples/analyze_text.cpp)
|
|
117
|
+
- [examples/extract_terms.cpp](examples/extract_terms.cpp)
|
|
118
|
+
- [examples/custom_analyzer.cpp](examples/custom_analyzer.cpp)
|
|
119
|
+
- [examples/analyze_query.cpp](examples/analyze_query.cpp)
|
|
120
|
+
- [examples/find_package_consumer/CMakeLists.txt](examples/find_package_consumer/CMakeLists.txt)
|
|
121
|
+
|
|
122
|
+
## Scope
|
|
123
|
+
|
|
124
|
+
`termflow` currently focuses on:
|
|
125
|
+
|
|
126
|
+
- English text analysis
|
|
127
|
+
- Batch-oriented APIs
|
|
128
|
+
- Query parsing and rewrite preparation
|
|
129
|
+
- Reusable components for embedding in larger applications
|
|
130
|
+
|
|
131
|
+
`termflow` does not currently provide:
|
|
132
|
+
|
|
133
|
+
- indexing or retrieval
|
|
134
|
+
- ranking or scoring
|
|
135
|
+
- token graphs
|
|
136
|
+
- phrase execution logic
|
|
137
|
+
- multilingual analyzers
|
|
138
|
+
|
|
139
|
+
## Build From Source
|
|
140
|
+
|
|
141
|
+
Local build:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
cmake -S . -B build -G Ninja
|
|
145
|
+
cmake --build build
|
|
146
|
+
ctest --test-dir build --output-on-failure
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Build Python bindings from source:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
cmake -S . -B build -G Ninja -DTERMFLOW_BUILD_PYTHON=ON
|
|
153
|
+
cmake --build build
|
|
154
|
+
PYTHONPATH=build/python python3 -c 'import termflow; print(termflow.EnglishAnalyzer().analyze_terms("Running Cars"))'
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Build Python distributions:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
python3 -m build --sdist --wheel
|
|
161
|
+
python3 -m twine check dist/*
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Project Status
|
|
165
|
+
|
|
166
|
+
`termflow` is early-stage and intentionally narrow in scope. The current focus is making the built-in English analysis and packaging story solid before expanding into more languages or broader IR features.
|
|
167
|
+
|
|
168
|
+
## Contributing
|
|
169
|
+
|
|
170
|
+
Issues and pull requests are welcome. If you want to make a larger API or packaging change, open an issue first so the direction is clear before implementation work starts.
|
|
171
|
+
|
|
172
|
+
## License
|
|
173
|
+
|
|
174
|
+
This repository does not yet include a `LICENSE` file. Until that is added, do not assume open source usage terms.
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# termflow
|
|
2
|
+
|
|
3
|
+
English text analysis for information retrieval workloads in C++ and Python.
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://pypi.org/project/termflow-ir/)
|
|
6
|
+
[Python versions](https://pypi.org/project/termflow-ir/)
|
|
7
|
+
[Python package workflow](https://github.com/gathera/termflow/actions/workflows/python-package.yml)
|
|
8
|
+
|
|
9
|
+
`termflow` is a library-first analysis stack for search, indexing, tagging, and query normalization. It provides a built-in English analyzer, term extraction helpers, and a lightweight query rewrite layer without trying to be a full search engine.
|
|
10
|
+
|
|
11
|
+
## Why termflow
|
|
12
|
+
|
|
13
|
+
- C++20 core library with optional Python bindings
|
|
14
|
+
- English analyzer with configurable stemming, stop words, possessive handling, and ASCII folding
|
|
15
|
+
- Term extraction API for finalized search/index terms
|
|
16
|
+
- Query parser and rewrite support for canonicalization, equivalents, and expansions
|
|
17
|
+
- Installable Python wheels for Linux and macOS
|
|
18
|
+
- CMake install flow for downstream C++ consumers
|
|
19
|
+
|
|
20
|
+
## Install
|
|
21
|
+
|
|
22
|
+
Python package:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install termflow-ir
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Python import:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import termflow
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
CLI quick check:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
termflow analyze "The Running Cars"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
For C++ installation and `find_package(termflow)` usage, see [docs/installation.md](docs/installation.md).
|
|
41
|
+
|
|
42
|
+
## Quick Start
|
|
43
|
+
|
|
44
|
+
Python:
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import termflow
|
|
48
|
+
|
|
49
|
+
analyzer = termflow.EnglishAnalyzer()
|
|
50
|
+
terms = analyzer.analyze_terms("The Running Cars")
|
|
51
|
+
normalized = analyzer.normalize("Running Café")
|
|
52
|
+
|
|
53
|
+
print(terms) # ['run', 'car']
|
|
54
|
+
print(normalized) # 'running café'
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
C++:
|
|
58
|
+
|
|
59
|
+
```cpp
|
|
60
|
+
#include <iostream>
|
|
61
|
+
#include "termflow/analysis/english_analyzer.hpp"
|
|
62
|
+
|
|
63
|
+
int main() {
|
|
64
|
+
termflow::EnglishAnalyzer analyzer;
|
|
65
|
+
const auto terms = analyzer.analyze_terms("The Running Cars");
|
|
66
|
+
|
|
67
|
+
for (const auto& term : terms) {
|
|
68
|
+
std::cout << term << "\n";
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Features
|
|
74
|
+
|
|
75
|
+
| Area | What it includes |
|
|
76
|
+
| --- | --- |
|
|
77
|
+
| Analysis | `EnglishAnalyzer`, token analysis, normalization, stemming, stop words, ASCII folding |
|
|
78
|
+
| Term extraction | `TermExtractor` with length, numeric, and character-policy filtering |
|
|
79
|
+
| Query processing | clause parsing, analyzed query terms, rewrite loading, validation, and alternatives |
|
|
80
|
+
| Python bindings | built-in analyzer, term extractor, and query module under `termflow.query` |
|
|
81
|
+
| CLI | `termflow analyze`, `termflow extract`, and `termflow analyze-query` for quick validation |
|
|
82
|
+
| C++ consumption | installable CMake package and external `find_package` example |
|
|
83
|
+
|
|
84
|
+
## Documentation
|
|
85
|
+
|
|
86
|
+
- [docs/usage.md](docs/usage.md) for day-to-day analyzer, term extraction, query, and Python usage
|
|
87
|
+
- [docs/customization.md](docs/customization.md) for pipeline tuning, query rewrites, and custom analyzers in C++
|
|
88
|
+
- [docs/installation.md](docs/installation.md) for Python and C++ installation paths
|
|
89
|
+
- [docs/installation-roadmap.md](docs/installation-roadmap.md) for packaging and distribution priorities
|
|
90
|
+
|
|
91
|
+
Runnable examples:
|
|
92
|
+
|
|
93
|
+
- [examples/analyze_text.cpp](examples/analyze_text.cpp)
|
|
94
|
+
- [examples/extract_terms.cpp](examples/extract_terms.cpp)
|
|
95
|
+
- [examples/custom_analyzer.cpp](examples/custom_analyzer.cpp)
|
|
96
|
+
- [examples/analyze_query.cpp](examples/analyze_query.cpp)
|
|
97
|
+
- [examples/find_package_consumer/CMakeLists.txt](examples/find_package_consumer/CMakeLists.txt)
|
|
98
|
+
|
|
99
|
+
## Scope
|
|
100
|
+
|
|
101
|
+
`termflow` currently focuses on:
|
|
102
|
+
|
|
103
|
+
- English text analysis
|
|
104
|
+
- Batch-oriented APIs
|
|
105
|
+
- Query parsing and rewrite preparation
|
|
106
|
+
- Reusable components for embedding in larger applications
|
|
107
|
+
|
|
108
|
+
`termflow` does not currently provide:
|
|
109
|
+
|
|
110
|
+
- indexing or retrieval
|
|
111
|
+
- ranking or scoring
|
|
112
|
+
- token graphs
|
|
113
|
+
- phrase execution logic
|
|
114
|
+
- multilingual analyzers
|
|
115
|
+
|
|
116
|
+
## Build From Source
|
|
117
|
+
|
|
118
|
+
Local build:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
cmake -S . -B build -G Ninja
|
|
122
|
+
cmake --build build
|
|
123
|
+
ctest --test-dir build --output-on-failure
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Build Python bindings from source:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
cmake -S . -B build -G Ninja -DTERMFLOW_BUILD_PYTHON=ON
|
|
130
|
+
cmake --build build
|
|
131
|
+
PYTHONPATH=build/python python3 -c 'import termflow; print(termflow.EnglishAnalyzer().analyze_terms("Running Cars"))'
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Build Python distributions:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
python3 -m build --sdist --wheel
|
|
138
|
+
python3 -m twine check dist/*
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Project Status
|
|
142
|
+
|
|
143
|
+
`termflow` is early-stage and intentionally narrow in scope. The current focus is making the built-in English analysis and packaging story solid before expanding into more languages or broader IR features.
|
|
144
|
+
|
|
145
|
+
## Contributing
|
|
146
|
+
|
|
147
|
+
Issues and pull requests are welcome. If you want to make a larger API or packaging change, open an issue first so the direction is clear before implementation work starts.
|
|
148
|
+
|
|
149
|
+
## License
|
|
150
|
+
|
|
151
|
+
This repository does not yet include a `LICENSE` file. Until that is added, do not assume open source usage terms.
|