zaza-semantic-engine 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zaza_semantic_engine-3.0.0/LICENSE +21 -0
- zaza_semantic_engine-3.0.0/PKG-INFO +170 -0
- zaza_semantic_engine-3.0.0/README.md +127 -0
- zaza_semantic_engine-3.0.0/pyproject.toml +68 -0
- zaza_semantic_engine-3.0.0/setup.cfg +4 -0
- zaza_semantic_engine-3.0.0/src/zaza/__init__.py +3 -0
- zaza_semantic_engine-3.0.0/src/zaza/analysis.py +86 -0
- zaza_semantic_engine-3.0.0/src/zaza/api.py +177 -0
- zaza_semantic_engine-3.0.0/src/zaza/cli.py +198 -0
- zaza_semantic_engine-3.0.0/src/zaza/config.py +79 -0
- zaza_semantic_engine-3.0.0/src/zaza/database.py +176 -0
- zaza_semantic_engine-3.0.0/src/zaza/embeddings.py +134 -0
- zaza_semantic_engine-3.0.0/src/zaza/engine.py +179 -0
- zaza_semantic_engine-3.0.0/src/zaza/ingestion.py +287 -0
- zaza_semantic_engine-3.0.0/src/zaza/reporting.py +83 -0
- zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/PKG-INFO +170 -0
- zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/SOURCES.txt +27 -0
- zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/dependency_links.txt +1 -0
- zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/entry_points.txt +2 -0
- zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/requires.txt +23 -0
- zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/top_level.txt +1 -0
- zaza_semantic_engine-3.0.0/tests/test_analysis.py +69 -0
- zaza_semantic_engine-3.0.0/tests/test_cli.py +52 -0
- zaza_semantic_engine-3.0.0/tests/test_database.py +81 -0
- zaza_semantic_engine-3.0.0/tests/test_embeddings.py +142 -0
- zaza_semantic_engine-3.0.0/tests/test_formats_v3.py +272 -0
- zaza_semantic_engine-3.0.0/tests/test_ingestion.py +70 -0
- zaza_semantic_engine-3.0.0/tests/test_ingestion_extended.py +75 -0
- zaza_semantic_engine-3.0.0/tests/test_reporting.py +67 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Zaza
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zaza-semantic-engine
|
|
3
|
+
Version: 3.0.0
|
|
4
|
+
Summary: Local-first multi-format document ingestion engine with semantic search using sentence-transformers and ChromaDB
|
|
5
|
+
Author: zaza6525
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/zaza6525/zaza-semantic-engine
|
|
8
|
+
Project-URL: Repository, https://github.com/zaza6525/zaza-semantic-engine
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: pyyaml>=6.0
|
|
24
|
+
Requires-Dist: pypdf>=4.0
|
|
25
|
+
Requires-Dist: chardet>=5.0
|
|
26
|
+
Requires-Dist: python-docx>=1.1
|
|
27
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
28
|
+
Requires-Dist: lxml>=5.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
32
|
+
Provides-Extra: api
|
|
33
|
+
Requires-Dist: fastapi>=0.104; extra == "api"
|
|
34
|
+
Requires-Dist: uvicorn>=0.24; extra == "api"
|
|
35
|
+
Requires-Dist: python-multipart>=0.0.6; extra == "api"
|
|
36
|
+
Provides-Extra: semantic
|
|
37
|
+
Requires-Dist: chromadb>=0.5; extra == "semantic"
|
|
38
|
+
Requires-Dist: sentence-transformers>=3.0; extra == "semantic"
|
|
39
|
+
Requires-Dist: ebooklib>=0.18; extra == "semantic"
|
|
40
|
+
Provides-Extra: all
|
|
41
|
+
Requires-Dist: zaza-semantic-engine[api,semantic]; extra == "all"
|
|
42
|
+
Dynamic: license-file
|
|
43
|
+
|
|
44
|
+
# Zaza Semantic Engine
|
|
45
|
+
|
|
46
|
+
Local-first multi-format document ingestion engine with **real semantic search**.
|
|
47
|
+
|
|
48
|
+
[CI](https://github.com/zaza6525/zaza-semantic-engine/actions)
|
|
49
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
50
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
51
|
+
[PyPI](https://pypi.org/project/zaza-semantic-engine/)
|
|
52
|
+
|
|
53
|
+
## Why Zaza?
|
|
54
|
+
|
|
55
|
+
Most document tools fall into two camps: cloud-based SaaS (your docs leave your machine) or dumb keyword search (finds exact word matches, misses the point). Zaza avoids both pitfalls: it runs **locally** and searches **semantically**.
|
|
56
|
+
|
|
57
|
+
- **Local-first** — your documents never leave your machine. No API keys, no data leaks.
|
|
58
|
+
- **Semantic search** — find documents by *meaning*, not just keywords. Search "budget" and it finds "financial analysis", "quarterly results".
|
|
59
|
+
- **Multi-format** — TXT, PDF, Markdown, DOCX, JSON, YAML, EPUB, CSV, HTML, XML. Ingest anything.
|
|
60
|
+
- **50+ languages** — built on `paraphrase-multilingual-MiniLM-L12-v2`. Search in French, English, Arabic, or any supported language.
|
|
61
|
+
- **Zero config** — `zaza ingest ./docs/` and you're done.
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
# Core package
|
|
67
|
+
pip install -e .
|
|
68
|
+
|
|
69
|
+
# With API support
|
|
70
|
+
pip install -e ".[api]"
|
|
71
|
+
|
|
72
|
+
# With semantic search (embeddings + multilingual model)
|
|
73
|
+
pip install -e ".[semantic]"
|
|
74
|
+
|
|
75
|
+
# Full installation
|
|
76
|
+
pip install -e ".[all]"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Quick Start
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# Ingest documents
|
|
83
|
+
zaza ingest ./my-documents/
|
|
84
|
+
|
|
85
|
+
# Keyword search (by filename)
|
|
86
|
+
zaza search "report"
|
|
87
|
+
|
|
88
|
+
# Semantic search (by meaning)
|
|
89
|
+
zaza search-semantic "financial analysis quarterly results" --top 5
|
|
90
|
+
|
|
91
|
+
# View stats
|
|
92
|
+
zaza stats
|
|
93
|
+
|
|
94
|
+
# Start API server (V3: either form works)
|
|
95
|
+
zaza api
|
|
96
|
+
zaza server
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Semantic Search in Action
|
|
100
|
+
|
|
101
|
+
This project uses **sentence-transformers** (`paraphrase-multilingual-MiniLM-L12-v2`) to generate embeddings and **ChromaDB** for vector storage.
|
|
102
|
+
|
|
103
|
+
Unlike keyword search, semantic search finds documents with *related concepts* even when the exact words differ:
|
|
104
|
+
|
|
105
|
+
| Query | Keyword Search | Semantic Search |
|
|
106
|
+
|-------|---------------|-----------------|
|
|
107
|
+
| "budget" | Only files named "budget" | Finds "financial report", "quarterly analysis", "cost breakdown" |
|
|
108
|
+
| "rapport financier" | Only French files with exact match | Finds "financial analysis", "balance sheet", "revenue summary" |
|
|
109
|
+
|
|
110
|
+
## CLI Commands
|
|
111
|
+
|
|
112
|
+
| Command | Description |
|
|
113
|
+
|---------|-------------|
|
|
114
|
+
| `zaza ingest <path>` | Index documents from a directory or file |
|
|
115
|
+
| `zaza search <query>` | Search documents by filename (keyword) |
|
|
116
|
+
| `zaza search-semantic <query>` | Semantic search using embeddings |
|
|
117
|
+
| `zaza stats` | Show indexing statistics |
|
|
118
|
+
| `zaza documents` | List all indexed documents |
|
|
119
|
+
| `zaza report [format]` | Generate report (json/csv) |
|
|
120
|
+
| `zaza api` | Start the REST API server |
|
|
121
|
+
| `zaza server` | **V3 alias** — same as `zaza api` |
|
|
122
|
+
|
|
123
|
+
## API Endpoints
|
|
124
|
+
|
|
125
|
+
| Method | Path | Description |
|
|
126
|
+
|--------|------|-------------|
|
|
127
|
+
| GET | `/health` | Health check |
|
|
128
|
+
| GET | `/summary` | Engine summary |
|
|
129
|
+
| GET | `/documents` | List documents |
|
|
130
|
+
| GET | `/search?query=` | Keyword search |
|
|
131
|
+
| GET | `/search-semantic?query=&top=10` | Semantic search |
|
|
132
|
+
| GET | `/embeddings/status` | Check embedding store |
|
|
133
|
+
| POST | `/analyze` | Analyze raw text |
|
|
134
|
+
| POST | `/ingest/file` | Upload and ingest a file |
|
|
135
|
+
| POST | `/ingest/directory` | Ingest all files from directory |
|
|
136
|
+
|
|
137
|
+
## Supported Formats
|
|
138
|
+
|
|
139
|
+
| Format | Extension | Method |
|
|
140
|
+
|--------|-----------|--------|
|
|
141
|
+
| Plain text | `.txt` | Direct read |
|
|
142
|
+
| Markdown | `.md`, `.markdown` | Syntax stripped |
|
|
143
|
+
| PDF | `.pdf` | via `pypdf` |
|
|
144
|
+
| CSV | `.csv` | Converted to key-value |
|
|
145
|
+
| HTML | `.html`, `.htm` | via `BeautifulSoup` |
|
|
146
|
+
| XML | `.xml` | Standard library |
|
|
147
|
+
| Word | `.docx` | via `python-docx` |
|
|
148
|
+
| JSON | `.json` | Recursive key-value (V3) |
|
|
149
|
+
| YAML | `.yaml`, `.yml` | Recursive key-value (V3) |
|
|
150
|
+
| ePUB | `.epub` | via `ebooklib` (V3, requires `[semantic]`) |
|
|
151
|
+
|
|
152
|
+
## Model Caching (V3)
|
|
153
|
+
|
|
154
|
+
The embedding model is cached globally within a single process. `zaza ingest` + `zaza search-semantic` doesn't reload the model — it reuses the cached instance. Startup time drops significantly.
|
|
155
|
+
|
|
156
|
+
## Configuration
|
|
157
|
+
|
|
158
|
+
Edit `config.yaml` to customize paths, embedding models, and search settings.
|
|
159
|
+
|
|
160
|
+
```yaml
|
|
161
|
+
semantic:
|
|
162
|
+
enabled: true # Set false to disable embeddings
|
|
163
|
+
model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
|
164
|
+
embed_dir: "./data/embeddings" # ChromaDB persist directory
|
|
165
|
+
max_search_results: 10
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## License
|
|
169
|
+
|
|
170
|
+
MIT
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# Zaza Semantic Engine
|
|
2
|
+
|
|
3
|
+
Local-first multi-format document ingestion engine with **real semantic search**.
|
|
4
|
+
|
|
5
|
+
[CI](https://github.com/zaza6525/zaza-semantic-engine/actions)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
8
|
+
[PyPI](https://pypi.org/project/zaza-semantic-engine/)
|
|
9
|
+
|
|
10
|
+
## Why Zaza?
|
|
11
|
+
|
|
12
|
+
Most document tools fall into two camps: cloud-based SaaS (your docs leave your machine) or dumb keyword search (finds exact word matches, misses the point). Zaza avoids both pitfalls: it runs **locally** and searches **semantically**.
|
|
13
|
+
|
|
14
|
+
- **Local-first** — your documents never leave your machine. No API keys, no data leaks.
|
|
15
|
+
- **Semantic search** — find documents by *meaning*, not just keywords. Search "budget" and it finds "financial analysis", "quarterly results".
|
|
16
|
+
- **Multi-format** — TXT, PDF, Markdown, DOCX, JSON, YAML, EPUB, CSV, HTML, XML. Ingest anything.
|
|
17
|
+
- **50+ languages** — built on `paraphrase-multilingual-MiniLM-L12-v2`. Search in French, English, Arabic, or any supported language.
|
|
18
|
+
- **Zero config** — `zaza ingest ./docs/` and you're done.
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Core package
|
|
24
|
+
pip install -e .
|
|
25
|
+
|
|
26
|
+
# With API support
|
|
27
|
+
pip install -e ".[api]"
|
|
28
|
+
|
|
29
|
+
# With semantic search (embeddings + multilingual model)
|
|
30
|
+
pip install -e ".[semantic]"
|
|
31
|
+
|
|
32
|
+
# Full installation
|
|
33
|
+
pip install -e ".[all]"
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Ingest documents
|
|
40
|
+
zaza ingest ./my-documents/
|
|
41
|
+
|
|
42
|
+
# Keyword search (by filename)
|
|
43
|
+
zaza search "report"
|
|
44
|
+
|
|
45
|
+
# Semantic search (by meaning)
|
|
46
|
+
zaza search-semantic "financial analysis quarterly results" --top 5
|
|
47
|
+
|
|
48
|
+
# View stats
|
|
49
|
+
zaza stats
|
|
50
|
+
|
|
51
|
+
# Start API server (V3: either form works)
|
|
52
|
+
zaza api
|
|
53
|
+
zaza server
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Semantic Search in Action
|
|
57
|
+
|
|
58
|
+
This project uses **sentence-transformers** (`paraphrase-multilingual-MiniLM-L12-v2`) to generate embeddings and **ChromaDB** for vector storage.
|
|
59
|
+
|
|
60
|
+
Unlike keyword search, semantic search finds documents with *related concepts* even when the exact words differ:
|
|
61
|
+
|
|
62
|
+
| Query | Keyword Search | Semantic Search |
|
|
63
|
+
|-------|---------------|-----------------|
|
|
64
|
+
| "budget" | Only files named "budget" | Finds "financial report", "quarterly analysis", "cost breakdown" |
|
|
65
|
+
| "rapport financier" | Only French files with exact match | Finds "financial analysis", "balance sheet", "revenue summary" |
|
|
66
|
+
|
|
67
|
+
## CLI Commands
|
|
68
|
+
|
|
69
|
+
| Command | Description |
|
|
70
|
+
|---------|-------------|
|
|
71
|
+
| `zaza ingest <path>` | Index documents from a directory or file |
|
|
72
|
+
| `zaza search <query>` | Search documents by filename (keyword) |
|
|
73
|
+
| `zaza search-semantic <query>` | Semantic search using embeddings |
|
|
74
|
+
| `zaza stats` | Show indexing statistics |
|
|
75
|
+
| `zaza documents` | List all indexed documents |
|
|
76
|
+
| `zaza report [format]` | Generate report (json/csv) |
|
|
77
|
+
| `zaza api` | Start the REST API server |
|
|
78
|
+
| `zaza server` | **V3 alias** — same as `zaza api` |
|
|
79
|
+
|
|
80
|
+
## API Endpoints
|
|
81
|
+
|
|
82
|
+
| Method | Path | Description |
|
|
83
|
+
|--------|------|-------------|
|
|
84
|
+
| GET | `/health` | Health check |
|
|
85
|
+
| GET | `/summary` | Engine summary |
|
|
86
|
+
| GET | `/documents` | List documents |
|
|
87
|
+
| GET | `/search?query=` | Keyword search |
|
|
88
|
+
| GET | `/search-semantic?query=&top=10` | Semantic search |
|
|
89
|
+
| GET | `/embeddings/status` | Check embedding store |
|
|
90
|
+
| POST | `/analyze` | Analyze raw text |
|
|
91
|
+
| POST | `/ingest/file` | Upload and ingest a file |
|
|
92
|
+
| POST | `/ingest/directory` | Ingest all files from directory |
|
|
93
|
+
|
|
94
|
+
## Supported Formats
|
|
95
|
+
|
|
96
|
+
| Format | Extension | Method |
|
|
97
|
+
|--------|-----------|--------|
|
|
98
|
+
| Plain text | `.txt` | Direct read |
|
|
99
|
+
| Markdown | `.md`, `.markdown` | Syntax stripped |
|
|
100
|
+
| PDF | `.pdf` | via `pypdf` |
|
|
101
|
+
| CSV | `.csv` | Converted to key-value |
|
|
102
|
+
| HTML | `.html`, `.htm` | via `BeautifulSoup` |
|
|
103
|
+
| XML | `.xml` | Standard library |
|
|
104
|
+
| Word | `.docx` | via `python-docx` |
|
|
105
|
+
| JSON | `.json` | Recursive key-value (V3) |
|
|
106
|
+
| YAML | `.yaml`, `.yml` | Recursive key-value (V3) |
|
|
107
|
+
| ePUB | `.epub` | via `ebooklib` (V3, requires `[semantic]`) |
|
|
108
|
+
|
|
109
|
+
## Model Caching (V3)
|
|
110
|
+
|
|
111
|
+
The embedding model is cached globally within a single process. `zaza ingest` + `zaza search-semantic` doesn't reload the model — it reuses the cached instance. Startup time drops significantly.
|
|
112
|
+
|
|
113
|
+
## Configuration
|
|
114
|
+
|
|
115
|
+
Edit `config.yaml` to customize paths, embedding models, and search settings.
|
|
116
|
+
|
|
117
|
+
```yaml
|
|
118
|
+
semantic:
|
|
119
|
+
enabled: true # Set false to disable embeddings
|
|
120
|
+
model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
|
121
|
+
embed_dir: "./data/embeddings" # ChromaDB persist directory
|
|
122
|
+
max_search_results: 10
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## License
|
|
126
|
+
|
|
127
|
+
MIT
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "zaza-semantic-engine"
|
|
7
|
+
version = "3.0.0"
|
|
8
|
+
description = "Local-first multi-format document ingestion engine with semantic search using sentence-transformers and ChromaDB"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "zaza6525"},
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Programming Language :: Python :: 3.14",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
26
|
+
"Topic :: Text Processing :: Linguistic",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"pyyaml>=6.0",
|
|
30
|
+
"pypdf>=4.0",
|
|
31
|
+
"chardet>=5.0",
|
|
32
|
+
"python-docx>=1.1",
|
|
33
|
+
"beautifulsoup4>=4.12",
|
|
34
|
+
"lxml>=5.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
dev = [
|
|
39
|
+
"pytest>=7.0",
|
|
40
|
+
"pytest-cov>=4.0",
|
|
41
|
+
]
|
|
42
|
+
api = [
|
|
43
|
+
"fastapi>=0.104",
|
|
44
|
+
"uvicorn>=0.24",
|
|
45
|
+
"python-multipart>=0.0.6",
|
|
46
|
+
]
|
|
47
|
+
semantic = [
|
|
48
|
+
"chromadb>=0.5",
|
|
49
|
+
"sentence-transformers>=3.0",
|
|
50
|
+
"ebooklib>=0.18",
|
|
51
|
+
]
|
|
52
|
+
all = [
|
|
53
|
+
"zaza-semantic-engine[api,semantic]",
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
[project.urls]
|
|
57
|
+
Homepage = "https://github.com/zaza6525/zaza-semantic-engine"
|
|
58
|
+
Repository = "https://github.com/zaza6525/zaza-semantic-engine"
|
|
59
|
+
|
|
60
|
+
[project.scripts]
|
|
61
|
+
zaza = "zaza.cli:main"
|
|
62
|
+
|
|
63
|
+
[tool.setuptools.packages.find]
|
|
64
|
+
where = ["src"]
|
|
65
|
+
|
|
66
|
+
[tool.pytest.ini_options]
|
|
67
|
+
testpaths = ["tests"]
|
|
68
|
+
addopts = "-v --tb=short"
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Semantic analysis engine."""
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from typing import List, Dict, Tuple
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Simple stop words for French and English
|
|
8
|
+
STOP_WORDS = {
    # French function words and elided forms, followed by a handful of
    # English auxiliaries and pronouns ("was", "have", "this", ...).
    "fr": {
        "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en",
        "et", "etant", "eu", "il", "ils", "je", "juste", "la", "le", "les", "leur",
        "lui", "ma", "mais", "me", "mes", "mon", "ne", "nos", "notre", "nous", "on",
        "ou", "par", "pas", "pour", "qu", "que", "qui", "sa", "se", "ses", "son",
        "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre",
        "vous", "c", "d", "j", "l", "m", "n", "s", "t", "y", "est", "sont", "was",
        "been", "has", "have", "had", "a", "i", "it", "at", "be", "this", "that",
        "were", "are", "being",
    },
    # English articles, conjunctions, prepositions, auxiliaries and pronouns.
    "en": {
        "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of",
        "with", "by", "from", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "shall", "should",
        "can", "could", "may", "might", "must", "this", "that", "these", "those",
        "i", "you", "he", "she", "it", "we", "they", "what", "which", "who", "whom",
        "if", "then", "than", "so", "as", "about", "up", "out", "into", "through",
    },
}


def analyze_text(content: str, top_words: int = 20, min_word_length: int = 3,
                 stop_words_lang: str = "fr") -> Dict:
    """Compute simple lexical statistics for a piece of text.

    Args:
        content: Text to analyze.
        top_words: Maximum number of most frequent words to report.
        min_word_length: Words shorter than this are excluded from the
            frequency-based metrics (they still count in ``word_count``).
        stop_words_lang: Key into ``STOP_WORDS``. Unknown keys fall back to
            the English list.

    Returns:
        A dict with the keys ``word_count``, ``char_count``,
        ``sentence_count``, ``unique_words``, ``lexical_density``,
        ``top_words`` (list of ``{"word", "count"}`` dicts),
        ``avg_word_length`` and ``readability``. For empty or
        whitespace-only input every metric is zero/empty.
    """
    import re

    if not content or not content.strip():
        return {
            "word_count": 0,
            "char_count": 0,
            "sentence_count": 0,
            "unique_words": 0,
            "lexical_density": 0.0,
            "top_words": [],
            "avg_word_length": 0.0,
            "readability": {},
        }

    # A sentence ends at '.', '!' or '?' (or a run of them); fragments that
    # contain only whitespace are not counted. Non-empty text always counts
    # as at least one sentence.
    fragments = re.split(r'[.!?]+', content)
    sentences = max(sum(1 for part in fragments if part.strip()), 1)

    # Words are runs of Latin (including accented) or Cyrillic letters.
    clean_words = re.findall(r'[a-zA-Z\u00C0-\u024F\u0400-\u04FF]+', content.lower())

    # Keep only words that are long enough and are not stop words.
    sw = STOP_WORDS.get(stop_words_lang, STOP_WORDS["en"])
    meaningful = [w for w in clean_words
                  if len(w) >= min_word_length and w not in sw]

    unique_count = len(set(meaningful))
    # Guard the divisions below against texts with no meaningful word.
    divisor = max(len(meaningful), 1)

    top = Counter(meaningful).most_common(top_words)

    return {
        "word_count": len(clean_words),
        "char_count": len(content),
        "sentence_count": sentences,
        "unique_words": unique_count,
        # Share of distinct words among the meaningful words.
        "lexical_density": round(unique_count / divisor, 4),
        "top_words": [{"word": w, "count": c} for w, c in top],
        "avg_word_length": round(sum(len(w) for w in meaningful) / divisor, 2),
        "readability": {
            "words_per_sentence": round(len(meaningful) / sentences, 2),
        },
    }
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""FastAPI REST API for ZAZA Semantic Engine."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional, List, Dict, Any
|
|
5
|
+
from contextlib import asynccontextmanager
|
|
6
|
+
|
|
7
|
+
from fastapi import FastAPI, HTTPException, UploadFile, File, Form, Body
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared engine on startup and drop it on shutdown.

    The engine instance is stored on ``app.state.engine`` for the lifetime
    of the application and reset to ``None`` afterwards.
    """
    # Imported here, so zaza.engine is only loaded when the app starts.
    from zaza.engine import SemanticEngine

    app.state.engine = SemanticEngine()
    try:
        yield
    finally:
        # Runs even when an exception is thrown into the context manager,
        # so a stale engine reference is never left behind.
        app.state.engine = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
app = FastAPI(
    title="ZAZA Semantic Engine",
    description="Multi-format document ingestion and semantic analysis API",
    # Must match the package version declared in pyproject.toml.
    version="3.0.0",
    lifespan=lifespan,
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DocumentInfo(BaseModel):
    # Element type of the GET /documents response (used there as
    # response_model=List[DocumentInfo]).
    filename: str
    filetype: str
    word_count: int
    unique_words: int
    lexical_density: float
    # NOTE(review): presumably an ISO-8601 timestamp string; confirm against
    # the engine/database layer.
    ingested_at: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SummaryResponse(BaseModel):
    # Aggregate statistics over all ingested documents.
    # NOTE(review): not attached as a response_model to any endpoint in this
    # module; presumably mirrors engine.get_summary() - confirm.
    total_documents: int
    total_words: int
    total_characters: int
    average_lexical_density: float
    # Both timestamps are nullable (Optional) but have no default value.
    first_ingestion: Optional[str]
    last_ingestion: Optional[str]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class AnalysisResponse(BaseModel):
    # Per-document analysis metrics plus the document's filename.
    # NOTE(review): not attached as a response_model to any endpoint in this
    # module.
    filename: str
    word_count: int
    char_count: int
    sentence_count: int
    unique_words: int
    lexical_density: float
    avg_word_length: float
    # NOTE(review): presumably a list of {"word": ..., "count": ...} dicts as
    # produced by zaza.analysis.analyze_text - confirm.
    top_words: List[Dict[str, Any]]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class IngestResult(BaseModel):
    # Outcome of ingesting one file.
    # NOTE(review): not attached as a response_model to any endpoint in this
    # module; presumably mirrors the engine's ingest result - confirm.
    filename: str
    status: str
    # The remaining fields are optional and default to None.
    word_count: Optional[int] = None
    top_words: Optional[list] = None
    error: Optional[str] = None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@app.post("/ingest/file")
async def ingest_single_file(file: UploadFile = File(...)):
    """Ingest a single file."""
    # Responds with HTTP 500 when the engine is missing, and with HTTP 400
    # when the upload has no usable filename or the engine fails to ingest it.
    import tempfile

    engine = app.state.engine
    if not engine:
        raise HTTPException(500, "Engine not initialized")

    # The filename is client-controlled: keep only its final path component
    # so a value such as "../../etc/passwd" cannot escape the temp directory.
    safe_name = Path(file.filename or "").name
    if safe_name in {"", ".", ".."}:
        raise HTTPException(400, "Uploaded file has no valid filename")

    content = await file.read()

    # A fresh private directory per request keeps the original base name
    # (and extension) for the engine while preventing collisions between
    # concurrent uploads of equally named files. The directory and its
    # content are removed when the block exits, on success or failure.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / safe_name
        tmp_path.write_bytes(content)
        try:
            return engine.ingest_file(str(tmp_path))
        except Exception as e:
            # Any ingestion failure is reported to the client as a 400.
            raise HTTPException(400, str(e)) from e
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@app.post("/ingest/directory")
async def ingest_directory(dir_path: Optional[str] = None):
    """Ingest all files from a directory."""
    # Fail fast with HTTP 500 when no engine is attached to the app.
    if not (engine := app.state.engine):
        raise HTTPException(500, "Engine not initialized")

    # dir_path is forwarded unchanged; it is None when the caller omits it.
    return engine.ingest_directory(dir_path)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@app.get("/summary")
async def get_summary():
    """Get overall analysis summary."""
    # Fail fast with HTTP 500 when no engine is attached to the app.
    if not (engine := app.state.engine):
        raise HTTPException(500, "Engine not initialized")

    # The engine's summary is returned to the client as-is.
    return engine.get_summary()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@app.get("/documents", response_model=List[DocumentInfo])
async def get_documents(search: Optional[str] = None):
    """List all ingested documents."""
    # Fail fast with HTTP 500 when no engine is attached to the app.
    if not (engine := app.state.engine):
        raise HTTPException(500, "Engine not initialized")

    # A non-empty `search` value switches from the full listing to a
    # keyword search on the engine.
    return engine.search(search) if search else engine.get_documents()
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@app.get("/search")
async def search_documents(query: str):
    """Search documents by name (keyword)."""
    # `query` is a required query-string parameter (it has no default).
    if not (engine := app.state.engine):
        raise HTTPException(500, "Engine not initialized")

    return engine.search(query)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@app.get("/search-semantic")
async def search_semantic_documents(query: str, top: int = 10):
    """Semantic search using document embeddings."""
    # Fail fast with HTTP 500 when no engine is attached to the app.
    if not (engine := app.state.engine):
        raise HTTPException(500, "Engine not initialized")

    # `top` is passed to the engine as the number of results to return.
    return engine.search_semantic(query, n_results=top)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@app.get("/embeddings/status")
async def embedding_status():
    """Check embedding store status."""
    # Unlike the other endpoints, a missing engine is reported in the
    # response body rather than as an HTTP error.
    engine = app.state.engine
    if not engine:
        return {"enabled": False, "reason": "Engine not initialized"}

    store = engine.embed_store
    if not store:
        return {"enabled": False, "reason": "Embeddings not available"}

    return {
        "enabled": True,
        "model": store.model_name,
        "documents_count": store.collection.count(),
    }
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class TextAnalysisRequest(BaseModel):
    # Request body of POST /analyze.
    # Raw text to analyze.
    text: str
    # Passed to the analyzer as stop_words_lang; defaults to French.
    language: str = "fr"
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
@app.post("/analyze")
async def analyze_text(request: TextAnalysisRequest):
    """Analyze raw text (no file needed)."""
    # Imported under an alias because this endpoint function has the same
    # name as the analysis helper it delegates to.
    from zaza.analysis import analyze_text as analyze

    # The request's language selects the stop-word list.
    return analyze(request.text, stop_words_lang=request.language)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@app.get("/health")
async def health():
    """Health check."""
    # Report the version declared on the FastAPI app instead of repeating
    # the literal, so the two values cannot drift apart.
    return {"status": "ok", "version": app.version}
|