zaza-semantic-engine 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. zaza_semantic_engine-3.0.0/LICENSE +21 -0
  2. zaza_semantic_engine-3.0.0/PKG-INFO +170 -0
  3. zaza_semantic_engine-3.0.0/README.md +127 -0
  4. zaza_semantic_engine-3.0.0/pyproject.toml +68 -0
  5. zaza_semantic_engine-3.0.0/setup.cfg +4 -0
  6. zaza_semantic_engine-3.0.0/src/zaza/__init__.py +3 -0
  7. zaza_semantic_engine-3.0.0/src/zaza/analysis.py +86 -0
  8. zaza_semantic_engine-3.0.0/src/zaza/api.py +177 -0
  9. zaza_semantic_engine-3.0.0/src/zaza/cli.py +198 -0
  10. zaza_semantic_engine-3.0.0/src/zaza/config.py +79 -0
  11. zaza_semantic_engine-3.0.0/src/zaza/database.py +176 -0
  12. zaza_semantic_engine-3.0.0/src/zaza/embeddings.py +134 -0
  13. zaza_semantic_engine-3.0.0/src/zaza/engine.py +179 -0
  14. zaza_semantic_engine-3.0.0/src/zaza/ingestion.py +287 -0
  15. zaza_semantic_engine-3.0.0/src/zaza/reporting.py +83 -0
  16. zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/PKG-INFO +170 -0
  17. zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/SOURCES.txt +27 -0
  18. zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/dependency_links.txt +1 -0
  19. zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/entry_points.txt +2 -0
  20. zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/requires.txt +23 -0
  21. zaza_semantic_engine-3.0.0/src/zaza_semantic_engine.egg-info/top_level.txt +1 -0
  22. zaza_semantic_engine-3.0.0/tests/test_analysis.py +69 -0
  23. zaza_semantic_engine-3.0.0/tests/test_cli.py +52 -0
  24. zaza_semantic_engine-3.0.0/tests/test_database.py +81 -0
  25. zaza_semantic_engine-3.0.0/tests/test_embeddings.py +142 -0
  26. zaza_semantic_engine-3.0.0/tests/test_formats_v3.py +272 -0
  27. zaza_semantic_engine-3.0.0/tests/test_ingestion.py +70 -0
  28. zaza_semantic_engine-3.0.0/tests/test_ingestion_extended.py +75 -0
  29. zaza_semantic_engine-3.0.0/tests/test_reporting.py +67 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Zaza
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,170 @@
1
+ Metadata-Version: 2.4
2
+ Name: zaza-semantic-engine
3
+ Version: 3.0.0
4
+ Summary: Local-first multi-format document ingestion engine with semantic search using sentence-transformers and ChromaDB
5
+ Author: zaza6525
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/zaza6525/zaza-semantic-engine
8
+ Project-URL: Repository, https://github.com/zaza6525/zaza-semantic-engine
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Text Processing :: Linguistic
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pyyaml>=6.0
24
+ Requires-Dist: pypdf>=4.0
25
+ Requires-Dist: chardet>=5.0
26
+ Requires-Dist: python-docx>=1.1
27
+ Requires-Dist: beautifulsoup4>=4.12
28
+ Requires-Dist: lxml>=5.0
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7.0; extra == "dev"
31
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
32
+ Provides-Extra: api
33
+ Requires-Dist: fastapi>=0.104; extra == "api"
34
+ Requires-Dist: uvicorn>=0.24; extra == "api"
35
+ Requires-Dist: python-multipart>=0.0.6; extra == "api"
36
+ Provides-Extra: semantic
37
+ Requires-Dist: chromadb>=0.5; extra == "semantic"
38
+ Requires-Dist: sentence-transformers>=3.0; extra == "semantic"
39
+ Requires-Dist: ebooklib>=0.18; extra == "semantic"
40
+ Provides-Extra: all
41
+ Requires-Dist: zaza-semantic-engine[api,semantic]; extra == "all"
42
+ Dynamic: license-file
43
+
44
+ # Zaza Semantic Engine
45
+
46
+ Local-first multi-format document ingestion engine with **real semantic search**.
47
+
48
+ [![Tests](https://github.com/zaza6525/zaza-semantic-engine/actions/workflows/test.yml/badge.svg)](https://github.com/zaza6525/zaza-semantic-engine/actions)
49
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
50
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
51
+ [![PyPI version](https://img.shields.io/pypi/v/zaza-semantic-engine.svg)](https://pypi.org/project/zaza-semantic-engine/)
52
+
53
+ ## Why Zaza?
54
+
55
+ Most document tools fall into two camps: cloud-based SaaS (your docs leave your machine) or dumb keyword search (finds exact word matches, misses the point). Zaza avoids both trade-offs: it runs **locally** and searches **semantically**.
56
+
57
+ - **Local-first** — your documents never leave your machine. No API keys, no data leaks.
58
+ - **Semantic search** — find documents by *meaning*, not just keywords. Search "budget" and it finds "financial analysis", "quarterly results".
59
+ - **Multi-format** — TXT, PDF, Markdown, DOCX, JSON, YAML, EPUB, CSV, HTML, XML. Ingest anything.
60
+ - **50+ languages** — built on `paraphrase-multilingual-MiniLM-L12-v2`. Search in French, English, Arabic, or any supported language.
61
+ - **Zero config** — `zaza ingest ./docs/` and you're done.
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ # Core package
67
+ pip install -e .
68
+
69
+ # With API support
70
+ pip install -e ".[api]"
71
+
72
+ # With semantic search (embeddings + multilingual model)
73
+ pip install -e ".[semantic]"
74
+
75
+ # Full installation
76
+ pip install -e ".[all]"
77
+ ```
78
+
79
+ ## Quick Start
80
+
81
+ ```bash
82
+ # Ingest documents
83
+ zaza ingest ./my-documents/
84
+
85
+ # Keyword search (by filename)
86
+ zaza search "report"
87
+
88
+ # Semantic search (by meaning)
89
+ zaza search-semantic "financial analysis quarterly results" --top 5
90
+
91
+ # View stats
92
+ zaza stats
93
+
94
+ # Start API server (V3: either form works)
95
+ zaza api
96
+ zaza server
97
+ ```
98
+
99
+ ## Semantic Search in Action
100
+
101
+ This project uses **sentence-transformers** (`paraphrase-multilingual-MiniLM-L12-v2`) to generate embeddings and **ChromaDB** for vector storage.
102
+
103
+ Unlike keyword search, semantic search finds documents with *related concepts* even when the exact words differ:
104
+
105
+ | Query | Keyword Search | Semantic Search |
106
+ |-------|---------------|-----------------|
107
+ | "budget" | Only files named "budget" | Finds "financial report", "quarterly analysis", "cost breakdown" |
108
+ | "rapport financier" | Only French files with exact match | Finds "financial analysis", "balance sheet", "revenue summary" |
109
+
110
+ ## CLI Commands
111
+
112
+ | Command | Description |
113
+ |---------|-------------|
114
+ | `zaza ingest <path>` | Index documents from a directory or file |
115
+ | `zaza search <query>` | Search documents by filename (keyword) |
116
+ | `zaza search-semantic <query>` | Semantic search using embeddings |
117
+ | `zaza stats` | Show indexing statistics |
118
+ | `zaza documents` | List all indexed documents |
119
+ | `zaza report [format]` | Generate report (json/csv) |
120
+ | `zaza api` | Start the REST API server |
121
+ | `zaza server` | **V3 alias** — same as `zaza api` |
122
+
123
+ ## API Endpoints
124
+
125
+ | Method | Path | Description |
126
+ |--------|------|-------------|
127
+ | GET | `/health` | Health check |
128
+ | GET | `/summary` | Engine summary |
129
+ | GET | `/documents` | List documents |
130
+ | GET | `/search?query=` | Keyword search |
131
+ | GET | `/search-semantic?query=&top=10` | Semantic search |
132
+ | GET | `/embeddings/status` | Check embedding store |
133
+ | POST | `/analyze` | Analyze raw text |
134
+ | POST | `/ingest/file` | Upload and ingest a file |
135
+ | POST | `/ingest/directory` | Ingest all files from directory |
136
+
137
+ ## Supported Formats
138
+
139
+ | Format | Extension | Method |
140
+ |--------|-----------|--------|
141
+ | Plain text | `.txt` | Direct read |
142
+ | Markdown | `.md`, `.markdown` | Syntax stripped |
143
+ | PDF | `.pdf` | via `pypdf` |
144
+ | CSV | `.csv` | Converted to key-value |
145
+ | HTML | `.html`, `.htm` | via `BeautifulSoup` |
146
+ | XML | `.xml` | Standard library |
147
+ | Word | `.docx` | via `python-docx` |
148
+ | JSON | `.json` | Recursive key-value (V3) |
149
+ | YAML | `.yaml`, `.yml` | Recursive key-value (V3) |
150
+ | ePUB | `.epub` | via `ebooklib` (V3, requires `[semantic]`) |
151
+
152
+ ## Model Caching (V3)
153
+
154
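+ The embedding model is cached globally within a single process: once loaded, it is reused by every later ingestion or semantic search in that process (for example, inside the API server) instead of being reloaded, so repeated startup time drops significantly. Separate CLI invocations such as `zaza ingest` followed by `zaza search-semantic` run in separate processes, so each one still loads the model once.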
155
+
156
+ ## Configuration
157
+
158
+ Edit `config.yaml` to customize paths, embedding models, and search settings.
159
+
160
+ ```yaml
161
+ semantic:
162
+ enabled: true # Set false to disable embeddings
163
+ model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
164
+ embed_dir: "./data/embeddings" # ChromaDB persist directory
165
+ max_search_results: 10
166
+ ```
167
+
168
+ ## License
169
+
170
+ MIT
@@ -0,0 +1,127 @@
1
+ # Zaza Semantic Engine
2
+
3
+ Local-first multi-format document ingestion engine with **real semantic search**.
4
+
5
+ [![Tests](https://github.com/zaza6525/zaza-semantic-engine/actions/workflows/test.yml/badge.svg)](https://github.com/zaza6525/zaza-semantic-engine/actions)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
8
+ [![PyPI version](https://img.shields.io/pypi/v/zaza-semantic-engine.svg)](https://pypi.org/project/zaza-semantic-engine/)
9
+
10
+ ## Why Zaza?
11
+
12
+ Most document tools fall into two camps: cloud-based SaaS (your docs leave your machine) or dumb keyword search (finds exact word matches, misses the point). Zaza avoids both trade-offs: it runs **locally** and searches **semantically**.
13
+
14
+ - **Local-first** — your documents never leave your machine. No API keys, no data leaks.
15
+ - **Semantic search** — find documents by *meaning*, not just keywords. Search "budget" and it finds "financial analysis", "quarterly results".
16
+ - **Multi-format** — TXT, PDF, Markdown, DOCX, JSON, YAML, EPUB, CSV, HTML, XML. Ingest anything.
17
+ - **50+ languages** — built on `paraphrase-multilingual-MiniLM-L12-v2`. Search in French, English, Arabic, or any supported language.
18
+ - **Zero config** — `zaza ingest ./docs/` and you're done.
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ # Core package
24
+ pip install -e .
25
+
26
+ # With API support
27
+ pip install -e ".[api]"
28
+
29
+ # With semantic search (embeddings + multilingual model)
30
+ pip install -e ".[semantic]"
31
+
32
+ # Full installation
33
+ pip install -e ".[all]"
34
+ ```
35
+
36
+ ## Quick Start
37
+
38
+ ```bash
39
+ # Ingest documents
40
+ zaza ingest ./my-documents/
41
+
42
+ # Keyword search (by filename)
43
+ zaza search "report"
44
+
45
+ # Semantic search (by meaning)
46
+ zaza search-semantic "financial analysis quarterly results" --top 5
47
+
48
+ # View stats
49
+ zaza stats
50
+
51
+ # Start API server (V3: either form works)
52
+ zaza api
53
+ zaza server
54
+ ```
55
+
56
+ ## Semantic Search in Action
57
+
58
+ This project uses **sentence-transformers** (`paraphrase-multilingual-MiniLM-L12-v2`) to generate embeddings and **ChromaDB** for vector storage.
59
+
60
+ Unlike keyword search, semantic search finds documents with *related concepts* even when the exact words differ:
61
+
62
+ | Query | Keyword Search | Semantic Search |
63
+ |-------|---------------|-----------------|
64
+ | "budget" | Only files named "budget" | Finds "financial report", "quarterly analysis", "cost breakdown" |
65
+ | "rapport financier" | Only French files with exact match | Finds "financial analysis", "balance sheet", "revenue summary" |
66
+
67
+ ## CLI Commands
68
+
69
+ | Command | Description |
70
+ |---------|-------------|
71
+ | `zaza ingest <path>` | Index documents from a directory or file |
72
+ | `zaza search <query>` | Search documents by filename (keyword) |
73
+ | `zaza search-semantic <query>` | Semantic search using embeddings |
74
+ | `zaza stats` | Show indexing statistics |
75
+ | `zaza documents` | List all indexed documents |
76
+ | `zaza report [format]` | Generate report (json/csv) |
77
+ | `zaza api` | Start the REST API server |
78
+ | `zaza server` | **V3 alias** — same as `zaza api` |
79
+
80
+ ## API Endpoints
81
+
82
+ | Method | Path | Description |
83
+ |--------|------|-------------|
84
+ | GET | `/health` | Health check |
85
+ | GET | `/summary` | Engine summary |
86
+ | GET | `/documents` | List documents |
87
+ | GET | `/search?query=` | Keyword search |
88
+ | GET | `/search-semantic?query=&top=10` | Semantic search |
89
+ | GET | `/embeddings/status` | Check embedding store |
90
+ | POST | `/analyze` | Analyze raw text |
91
+ | POST | `/ingest/file` | Upload and ingest a file |
92
+ | POST | `/ingest/directory` | Ingest all files from directory |
93
+
94
+ ## Supported Formats
95
+
96
+ | Format | Extension | Method |
97
+ |--------|-----------|--------|
98
+ | Plain text | `.txt` | Direct read |
99
+ | Markdown | `.md`, `.markdown` | Syntax stripped |
100
+ | PDF | `.pdf` | via `pypdf` |
101
+ | CSV | `.csv` | Converted to key-value |
102
+ | HTML | `.html`, `.htm` | via `BeautifulSoup` |
103
+ | XML | `.xml` | Standard library |
104
+ | Word | `.docx` | via `python-docx` |
105
+ | JSON | `.json` | Recursive key-value (V3) |
106
+ | YAML | `.yaml`, `.yml` | Recursive key-value (V3) |
107
+ | ePUB | `.epub` | via `ebooklib` (V3, requires `[semantic]`) |
108
+
109
+ ## Model Caching (V3)
110
+
111
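+ The embedding model is cached globally within a single process: once loaded, it is reused by every later ingestion or semantic search in that process (for example, inside the API server) instead of being reloaded, so repeated startup time drops significantly. Separate CLI invocations such as `zaza ingest` followed by `zaza search-semantic` run in separate processes, so each one still loads the model once.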
112
+
113
+ ## Configuration
114
+
115
+ Edit `config.yaml` to customize paths, embedding models, and search settings.
116
+
117
+ ```yaml
118
+ semantic:
119
+ enabled: true # Set false to disable embeddings
120
+ model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
121
+ embed_dir: "./data/embeddings" # ChromaDB persist directory
122
+ max_search_results: 10
123
+ ```
124
+
125
+ ## License
126
+
127
+ MIT
@@ -0,0 +1,68 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "zaza-semantic-engine"
7
+ version = "3.0.0"
8
+ description = "Local-first multi-format document ingestion engine with semantic search using sentence-transformers and ChromaDB"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "zaza6525"},
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Programming Language :: Python :: 3.14",
25
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
+ "Topic :: Text Processing :: Linguistic",
27
+ ]
28
+ dependencies = [
29
+ "pyyaml>=6.0",
30
+ "pypdf>=4.0",
31
+ "chardet>=5.0",
32
+ "python-docx>=1.1",
33
+ "beautifulsoup4>=4.12",
34
+ "lxml>=5.0",
35
+ ]
36
+
37
+ [project.optional-dependencies]
38
+ dev = [
39
+ "pytest>=7.0",
40
+ "pytest-cov>=4.0",
41
+ ]
42
+ api = [
43
+ "fastapi>=0.104",
44
+ "uvicorn>=0.24",
45
+ "python-multipart>=0.0.6",
46
+ ]
47
+ semantic = [
48
+ "chromadb>=0.5",
49
+ "sentence-transformers>=3.0",
50
+ "ebooklib>=0.18",
51
+ ]
52
+ all = [
53
+ "zaza-semantic-engine[api,semantic]",
54
+ ]
55
+
56
+ [project.urls]
57
+ Homepage = "https://github.com/zaza6525/zaza-semantic-engine"
58
+ Repository = "https://github.com/zaza6525/zaza-semantic-engine"
59
+
60
+ [project.scripts]
61
+ zaza = "zaza.cli:main"
62
+
63
+ [tool.setuptools.packages.find]
64
+ where = ["src"]
65
+
66
+ [tool.pytest.ini_options]
67
+ testpaths = ["tests"]
68
+ addopts = "-v --tb=short"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
"""ZAZA Semantic Engine v3 - Multi-format semantic ingestion pipeline."""

# Keep in sync with `version` in pyproject.toml (this release is 3.0.0).
__version__ = "3.0.0"
@@ -0,0 +1,86 @@
1
+ """Semantic analysis engine."""
2
+
3
+ from collections import Counter
4
+ from typing import List, Dict, Tuple
5
+
6
+
7
# Stop words consulted by analyze_text, keyed by language code.
# NOTE(review): the "fr" set also carries a handful of English auxiliaries
# ("was", "been", "has", ...); they are kept so that results for the default
# language do not change.
STOP_WORDS = {
    "fr": {
        "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en",
        "et", "etant", "eu", "il", "ils", "je", "juste", "la", "le", "les", "leur",
        "lui", "ma", "mais", "me", "mes", "mon", "ne", "nos", "notre", "nous", "on",
        "ou", "par", "pas", "pour", "qu", "que", "qui", "sa", "se", "ses", "son",
        "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre",
        "vous", "c", "d", "j", "l", "m", "n", "s", "t", "y", "est", "sont", "was",
        "been", "has", "have", "had", "a", "i", "it", "at", "be", "this", "that",
        "were", "are", "being",
    },
    "en": {
        "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of",
        "with", "by", "from", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "shall", "should",
        "can", "could", "may", "might", "must", "this", "that", "these", "those",
        "i", "you", "he", "she", "it", "we", "they", "what", "which", "who", "whom",
        "if", "then", "than", "so", "as", "about", "up", "out", "into", "through",
    },
}


def analyze_text(content: str, top_words: int = 20, min_word_length: int = 3,
                 stop_words_lang: str = "fr") -> Dict:
    """Compute simple lexical statistics for a piece of text.

    Args:
        content: Raw text to analyze.
        top_words: Maximum number of entries in the ``top_words`` ranking.
        min_word_length: Words shorter than this are ignored for the
            frequency, density and average-length metrics.
        stop_words_lang: Key into ``STOP_WORDS``; an unknown key falls back
            to the English list.

    Returns:
        A dict with ``word_count``, ``char_count``, ``sentence_count``,
        ``unique_words``, ``lexical_density``, ``top_words`` (a list of
        ``{"word", "count"}`` dicts), ``avg_word_length`` and
        ``readability``. Empty or whitespace-only input yields zeroed
        metrics and an empty ``readability`` dict.
    """
    if not content or not content.strip():
        return {
            "word_count": 0,
            "char_count": 0,
            "sentence_count": 0,
            "unique_words": 0,
            "lexical_density": 0.0,
            "top_words": [],
            "avg_word_length": 0.0,
            "readability": {},
        }

    import re

    # A sentence ends at any run of '.', '!' or '?'. Splitting on '.' alone
    # reported a single sentence for text such as "Really? Yes!".
    sentences = sum(1 for part in re.split(r'[.!?]+', content) if part.strip())

    # Lower-cased runs of Latin (incl. accented) and Cyrillic letters.
    clean_words = re.findall(r'[a-zA-Z\u00C0-\u024F\u0400-\u04FF]+', content.lower())

    # Drop short tokens first, then stop words for the requested language.
    sw = STOP_WORDS.get(stop_words_lang, STOP_WORDS["en"])
    meaningful = [w for w in clean_words
                  if len(w) >= min_word_length and w not in sw]

    unique = set(meaningful)
    # max(..., 1) guards the divisions when every token was filtered out.
    denom = max(len(meaningful), 1)
    top = Counter(meaningful).most_common(top_words)

    return {
        "word_count": len(clean_words),
        "char_count": len(content),
        "sentence_count": max(sentences, 1),
        "unique_words": len(unique),
        "lexical_density": round(len(unique) / denom, 4),
        "top_words": [{"word": w, "count": c} for w, c in top],
        "avg_word_length": round(sum(len(w) for w in meaningful) / denom, 2),
        "readability": {
            # Based on the filtered ("meaningful") words, not the raw count.
            "words_per_sentence": round(len(meaningful) / max(sentences, 1), 2),
        },
    }
@@ -0,0 +1,177 @@
1
+ """FastAPI REST API for ZAZA Semantic Engine."""
2
+
3
+ from pathlib import Path
4
+ from typing import Optional, List, Dict, Any
5
+ from contextlib import asynccontextmanager
6
+
7
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Form, Body
8
+ from pydantic import BaseModel
9
+
10
+
11
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the engine on startup and release it on shutdown.

    A ``SemanticEngine`` is stored on ``app.state.engine`` for the lifetime
    of the application; request handlers read it from there.
    """
    # Imported lazily so importing this module does not load the engine.
    from zaza.engine import SemanticEngine

    app.state.engine = SemanticEngine()
    try:
        yield
    finally:
        # Runs even if an exception is thrown into the generator at `yield`,
        # so the engine reference is always dropped on shutdown.
        app.state.engine = None
19
+
20
+
21
# ASGI application object; `version` is shown in the generated OpenAPI docs
# and must track the package release (3.0.0 in pyproject.toml).
app = FastAPI(
    title="ZAZA Semantic Engine",
    description="Multi-format document ingestion and semantic analysis API",
    version="3.0.0",
    lifespan=lifespan,
)
27
+
28
+
29
# Response schema for one entry of GET /documents (used as its response_model).
class DocumentInfo(BaseModel):
    # Identification
    filename: str
    filetype: str

    # Lexical metrics
    word_count: int
    unique_words: int
    lexical_density: float

    # Ingestion timestamp, carried as a string
    ingested_at: str
36
+
37
+
38
# Schema describing corpus-wide totals.
# NOTE(review): no endpoint in this module references it as a response_model.
class SummaryResponse(BaseModel):
    # Aggregate counters
    total_documents: int
    total_words: int
    total_characters: int
    average_lexical_density: float

    # Ingestion time range; nullable but has no default value
    first_ingestion: Optional[str]
    last_ingestion: Optional[str]
45
+
46
+
47
# Schema for per-document analysis metrics.
# NOTE(review): no endpoint in this module references it as a response_model.
class AnalysisResponse(BaseModel):
    filename: str

    # Size metrics
    word_count: int
    char_count: int
    sentence_count: int

    # Vocabulary metrics
    unique_words: int
    lexical_density: float
    avg_word_length: float

    # Ranked word entries, each a free-form dict
    top_words: List[Dict[str, Any]]
56
+
57
+
58
# Schema for the outcome of ingesting one file.
# NOTE(review): no endpoint in this module references it as a response_model.
class IngestResult(BaseModel):
    # Always present
    filename: str
    status: str

    # Optional details; each defaults to None
    word_count: Optional[int] = None
    top_words: Optional[list] = None
    error: Optional[str] = None
64
+
65
+
66
@app.post("/ingest/file")
async def ingest_single_file(file: UploadFile = File(...)):
    """Ingest a single uploaded file.

    The upload is written into a private temporary directory under its
    sanitized base name, passed to ``engine.ingest_file``, and removed
    afterwards whether or not ingestion succeeds.

    Raises:
        HTTPException: 500 if the engine is not initialized; 400 if the
            filename is missing or invalid, or if ingestion fails.
    """
    import tempfile
    from pathlib import PureWindowsPath

    engine = app.state.engine
    if not engine:
        raise HTTPException(500, "Engine not initialized")

    # The filename is client-controlled: keep only its final component so a
    # value like "../../etc/cron.d/x" cannot escape the temporary directory.
    # PureWindowsPath treats both "/" and "\\" as separators.
    safe_name = PureWindowsPath(file.filename or "").name
    if not safe_name or safe_name in {".", ".."} or "\x00" in safe_name:
        raise HTTPException(400, "Invalid or missing filename")

    content = await file.read()

    # A fresh directory per request avoids collisions between concurrent
    # uploads that share a name, and is removed even if the write fails.
    with tempfile.TemporaryDirectory(prefix="zaza-upload-") as tmp_dir:
        tmp_path = Path(tmp_dir) / safe_name
        tmp_path.write_bytes(content)
        try:
            return engine.ingest_file(str(tmp_path))
        except Exception as e:
            # Ingestion errors are reported to the client as a bad request.
            raise HTTPException(400, str(e)) from e
86
+
87
+
88
@app.post("/ingest/directory")
async def ingest_directory(dir_path: Optional[str] = None):
    """Ingest every file of a directory and return the engine's results.

    ``dir_path`` is forwarded unchanged to ``engine.ingest_directory``
    (``None`` when omitted). Responds with HTTP 500 when no engine is set.
    """
    active_engine = app.state.engine
    if active_engine:
        return active_engine.ingest_directory(dir_path)
    raise HTTPException(500, "Engine not initialized")
97
+
98
+
99
@app.get("/summary")
async def get_summary():
    """Return the engine's overall summary, or HTTP 500 when no engine is set."""
    active_engine = app.state.engine
    if active_engine:
        return active_engine.get_summary()
    raise HTTPException(500, "Engine not initialized")
107
+
108
+
109
@app.get("/documents", response_model=List[DocumentInfo])
async def get_documents(search: Optional[str] = None):
    """List ingested documents, filtered by ``search`` when it is non-empty.

    Responds with HTTP 500 when no engine is set.
    """
    active_engine = app.state.engine
    if not active_engine:
        raise HTTPException(500, "Engine not initialized")

    # A non-empty term goes through the engine's search; otherwise list all.
    return active_engine.search(search) if search else active_engine.get_documents()
122
+
123
+
124
@app.get("/search")
async def search_documents(query: str):
    """Keyword search: pass ``query`` to ``engine.search`` and return its result.

    Responds with HTTP 500 when no engine is set.
    """
    active_engine = app.state.engine
    if active_engine:
        return active_engine.search(query)
    raise HTTPException(500, "Engine not initialized")
132
+
133
+
134
@app.get("/search-semantic")
async def search_semantic_documents(query: str, top: int = 10):
    """Semantic search: return the engine's matches for ``query``.

    ``top`` is forwarded as ``n_results``. Responds with HTTP 500 when no
    engine is set.
    """
    active_engine = app.state.engine
    if active_engine:
        return active_engine.search_semantic(query, n_results=top)
    raise HTTPException(500, "Engine not initialized")
143
+
144
+
145
@app.get("/embeddings/status")
async def embedding_status():
    """Report whether an embedding store is available.

    Unlike the other endpoints, a missing engine is reported in the payload
    rather than as an HTTP error.
    """
    current = app.state.engine
    if not current:
        return {"enabled": False, "reason": "Engine not initialized"}

    store = current.embed_store
    if not store:
        return {"enabled": False, "reason": "Embeddings not available"}

    return {
        "enabled": True,
        "model": store.model_name,
        "documents_count": store.collection.count(),
    }
159
+
160
+
161
# Request body accepted by POST /analyze.
class TextAnalysisRequest(BaseModel):
    # Raw text to analyze
    text: str

    # Passed to the analyzer as its stop-word language
    language: str = "fr"
164
+
165
+
166
@app.post("/analyze")
async def analyze_text(request: TextAnalysisRequest):
    """Run the text analyzer on ``request.text``; no file or engine is involved."""
    # Aliased on import because this handler has the same name as the analyzer.
    from zaza.analysis import analyze_text as run_analysis

    return run_analysis(request.text, stop_words_lang=request.language)
172
+
173
+
174
@app.get("/health")
async def health():
    """Health check: report liveness and the API version."""
    # Read the version from the application object rather than repeating a
    # literal, so this payload cannot drift from the OpenAPI metadata.
    return {"status": "ok", "version": app.version}