gnosisllm-knowledge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Fixed-size text chunker."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from gnosisllm_knowledge.core.domain.document import TextChunk
|
|
8
|
+
|
|
9
|
+
# Default chunking parameters
|
|
10
|
+
DEFAULT_CHUNK_SIZE = 4000 # Characters
|
|
11
|
+
DEFAULT_CHUNK_OVERLAP = 200 # Characters
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FixedSizeChunker:
    """Simple fixed-size text chunker.

    This chunker splits text into fixed-size chunks without regard for
    semantic boundaries. It's faster than sentence-aware chunking but
    may split words or sentences in the middle.

    For better results with natural language, use SentenceChunker.

    Example:
        ```python
        chunker = FixedSizeChunker(chunk_size=4000, chunk_overlap=200)
        chunks = chunker.chunk(long_text)
        ```

    Attributes:
        chunk_size: Target chunk size in characters.
        chunk_overlap: Overlap between consecutive chunks.
    """

    def __init__(
        self,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    ) -> None:
        """Initialize the chunker.

        Args:
            chunk_size: Target chunk size in characters.
            chunk_overlap: Overlap between consecutive chunks.

        Raises:
            ValueError: If the size/overlap combination is invalid.
        """
        self._validate_params(chunk_size, chunk_overlap)
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap

    @staticmethod
    def _validate_params(chunk_size: int, chunk_overlap: int) -> None:
        """Reject parameter combinations that would break chunking.

        A non-positive chunk_size or an overlap >= chunk_size makes the
        scan step (chunk_size - chunk_overlap) zero or negative, which
        would either raise a cryptic ``range()`` error (step == 0) or
        silently produce no chunks at all (step < 0).

        Raises:
            ValueError: If the combination is invalid.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must be non-negative")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")

    @property
    def name(self) -> str:
        """Return the chunker name."""
        return "fixed"

    @property
    def chunk_size(self) -> int:
        """Return the target chunk size."""
        return self._chunk_size

    @property
    def chunk_overlap(self) -> int:
        """Return the chunk overlap."""
        return self._chunk_overlap

    def chunk(self, text: str, **options: Any) -> list[TextChunk]:
        """Split text into fixed-size chunks.

        Args:
            text: The text to chunk.
            **options: Override options:
                - chunk_size: Override default chunk size
                - chunk_overlap: Override default overlap

        Returns:
            List of TextChunk objects.

        Raises:
            ValueError: If the overridden size/overlap combination is
                invalid.
        """
        # Get options with fallbacks
        chunk_size = options.get("chunk_size", self._chunk_size)
        chunk_overlap = options.get("chunk_overlap", self._chunk_overlap)

        # Overrides bypass __init__, so re-validate here; otherwise an
        # overlap >= size would make ``step`` <= 0 below and the loop
        # would raise (step == 0) or silently return [] (step < 0).
        self._validate_params(chunk_size, chunk_overlap)

        if not text:
            return []

        text = text.strip()
        if len(text) <= chunk_size:
            # Whole text fits in a single chunk.
            return [
                TextChunk(
                    content=text,
                    index=0,
                    start_position=0,
                    end_position=len(text),
                )
            ]

        chunks: list[TextChunk] = []
        step = chunk_size - chunk_overlap
        chunk_index = 0

        for start_pos in range(0, len(text), step):
            end_pos = min(start_pos + chunk_size, len(text))
            chunk_content = text[start_pos:end_pos]

            # Skip whitespace-only slices, but keep absolute positions
            # for the slices we do emit.
            if chunk_content.strip():
                chunks.append(
                    TextChunk(
                        content=chunk_content,
                        index=chunk_index,
                        start_position=start_pos,
                        end_position=end_pos,
                    )
                )
                chunk_index += 1

            # Stop if we've reached the end
            if end_pos >= len(text):
                break

        return chunks

    def estimate_chunks(self, text: str) -> int:
        """Estimate the number of chunks that would be created.

        Matches the loop in :meth:`chunk` (ignoring whitespace-only
        slices that get skipped): the first chunk covers ``chunk_size``
        characters, every subsequent chunk advances by ``step``.

        Args:
            text: The text to estimate.

        Returns:
            Estimated number of chunks.
        """
        if not text:
            return 0

        text_len = len(text)
        if text_len <= self._chunk_size:
            return 1

        # One initial chunk plus ceiling division of the remainder by
        # the step; the previous ceil(text_len / step) formula
        # overcounted because the first chunk spans chunk_size (not
        # step) characters.
        step = self._chunk_size - self._chunk_overlap
        return 1 + (text_len - self._chunk_size + step - 1) // step
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""Sentence-aware text chunker."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from gnosisllm_knowledge.core.domain.document import TextChunk
|
|
9
|
+
|
|
10
|
+
# Default chunking parameters
|
|
11
|
+
DEFAULT_CHUNK_SIZE = 8000 # Characters (fits ~2000 tokens)
|
|
12
|
+
DEFAULT_CHUNK_OVERLAP = 200 # Characters
|
|
13
|
+
DEFAULT_MIN_CHUNK_SIZE = 100 # Characters
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SentenceChunker:
    """Text chunker that respects sentence boundaries.

    This chunker splits text into chunks while trying to keep sentences
    intact. It finds sentence boundaries and creates chunks of approximately
    the target size without breaking sentences in the middle.

    Example:
        ```python
        chunker = SentenceChunker(chunk_size=4000, chunk_overlap=200)
        chunks = chunker.chunk(long_text)
        for chunk in chunks:
            print(f"Chunk {chunk.index}: {chunk.length} chars")
        ```

    Attributes:
        chunk_size: Target chunk size in characters.
        chunk_overlap: Overlap between consecutive chunks.
        min_chunk_size: Minimum chunk size to create.
    """

    def __init__(
        self,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
        min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE,
    ) -> None:
        """Initialize the chunker.

        Args:
            chunk_size: Target chunk size in characters.
            chunk_overlap: Overlap between consecutive chunks.
            min_chunk_size: Minimum chunk size to create.

        Raises:
            ValueError: If the parameter combination is invalid.
        """
        self._validate_params(chunk_size, chunk_overlap, min_chunk_size)
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._min_chunk_size = min_chunk_size

    @staticmethod
    def _validate_params(
        chunk_size: int,
        chunk_overlap: int,
        min_chunk_size: int,
    ) -> None:
        """Reject parameter combinations that would break chunking.

        Consistent with FixedSizeChunker's constructor check: an overlap
        >= chunk_size makes the loop in :meth:`chunk` advance only one
        character per iteration (``max(start + 1, end - overlap)``),
        emitting a pathological number of nearly identical chunks.

        Raises:
            ValueError: If the combination is invalid.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must be non-negative")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")
        if min_chunk_size < 0:
            raise ValueError("min_chunk_size must be non-negative")

    @property
    def name(self) -> str:
        """Return the chunker name."""
        return "sentence"

    @property
    def chunk_size(self) -> int:
        """Return the target chunk size."""
        return self._chunk_size

    @property
    def chunk_overlap(self) -> int:
        """Return the chunk overlap."""
        return self._chunk_overlap

    def chunk(self, text: str, **options: Any) -> list[TextChunk]:
        """Split text into chunks respecting sentence boundaries.

        Args:
            text: The text to chunk.
            **options: Override options:
                - chunk_size: Override default chunk size
                - chunk_overlap: Override default overlap
                - min_chunk_size: Override minimum size

        Returns:
            List of TextChunk objects.

        Raises:
            ValueError: If the overridden parameter combination is
                invalid.
        """
        # Get options with fallbacks
        chunk_size = options.get("chunk_size", self._chunk_size)
        chunk_overlap = options.get("chunk_overlap", self._chunk_overlap)
        min_chunk_size = options.get("min_chunk_size", self._min_chunk_size)

        # Overrides bypass __init__, so re-validate here.
        self._validate_params(chunk_size, chunk_overlap, min_chunk_size)

        # Clean and normalize text
        text = self._clean_text(text)

        # Short input: return it as a single chunk, or nothing if empty.
        if not text or len(text) < min_chunk_size:
            if text:
                return [
                    TextChunk(
                        content=text,
                        index=0,
                        start_position=0,
                        end_position=len(text),
                    )
                ]
            return []

        # Find sentence boundaries
        boundaries = self._find_sentence_boundaries(text)

        chunks: list[TextChunk] = []
        start_pos = 0
        chunk_index = 0

        while start_pos < len(text):
            # Find the end position for this chunk
            end_pos = min(start_pos + chunk_size, len(text))

            # If we're not at the end, find a good boundary
            if end_pos < len(text):
                # Find the nearest sentence boundary before end_pos
                best_boundary = self._find_best_boundary(
                    boundaries, start_pos, end_pos, chunk_size
                )
                if best_boundary:
                    end_pos = best_boundary

            # Extract chunk content
            chunk_content = text[start_pos:end_pos].strip()

            # Drop fragments below the minimum size (e.g. a tiny tail).
            if len(chunk_content) >= min_chunk_size:
                chunks.append(
                    TextChunk(
                        content=chunk_content,
                        index=chunk_index,
                        start_position=start_pos,
                        end_position=end_pos,
                    )
                )
                chunk_index += 1

            # Move to next chunk position with overlap
            if end_pos >= len(text):
                break

            # Back up by the overlap, but always advance at least one
            # character so the loop terminates.
            start_pos = max(start_pos + 1, end_pos - chunk_overlap)

            # Ensure we're making progress
            if start_pos >= end_pos:
                start_pos = end_pos

        return chunks

    def estimate_chunks(self, text: str) -> int:
        """Estimate the number of chunks that would be created.

        Args:
            text: The text to estimate.

        Returns:
            Estimated number of chunks.
        """
        if not text:
            return 0

        text_len = len(text)
        if text_len <= self._chunk_size:
            return 1

        # Account for overlap. The <= 0 fallback is defensive only:
        # _validate_params guarantees a positive effective size.
        effective_chunk_size = self._chunk_size - self._chunk_overlap
        if effective_chunk_size <= 0:
            effective_chunk_size = self._chunk_size

        return max(1, (text_len + effective_chunk_size - 1) // effective_chunk_size)

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Args:
            text: Raw text.

        Returns:
            Cleaned text.
        """
        # Normalize whitespace
        text = re.sub(r"\s+", " ", text)
        # Remove non-printable characters (except newlines)
        text = "".join(c for c in text if c.isprintable() or c in "\n\t")
        return text.strip()

    def _find_sentence_boundaries(self, text: str) -> list[int]:
        """Find sentence boundary positions in text.

        Args:
            text: The text to analyze.

        Returns:
            List of positions after sentence endings.
        """
        boundaries: list[int] = []

        # Pattern for sentence endings: .!? followed by space or end
        # Also handle paragraph breaks
        pattern = r"[.!?]+[\s\n]+|[\n]{2,}"

        for match in re.finditer(pattern, text):
            boundaries.append(match.end())

        return boundaries

    def _find_best_boundary(
        self,
        boundaries: list[int],
        start_pos: int,
        end_pos: int,
        chunk_size: int,
    ) -> int | None:
        """Find the best sentence boundary for chunking.

        Tries to find a boundary close to end_pos but not too close
        to start_pos.

        Args:
            boundaries: List of sentence boundaries.
            start_pos: Chunk start position.
            end_pos: Desired end position.
            chunk_size: Target chunk size.

        Returns:
            Best boundary position or None if none found.
        """
        min_pos = start_pos + (chunk_size // 2)  # Don't split too early
        best = None

        # boundaries is ascending (regex scan order), so we can stop at
        # the first boundary past end_pos; the last one in
        # [min_pos, end_pos] wins.
        for boundary in boundaries:
            if boundary <= start_pos:
                continue
            if boundary > end_pos:
                break
            if boundary >= min_pos:
                best = boundary

        return best
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""GnosisLLM Knowledge CLI.
|
|
2
|
+
|
|
3
|
+
Enterprise-grade command-line interface for knowledge management.
|
|
4
|
+
|
|
5
|
+
Commands:
|
|
6
|
+
setup - Configure OpenSearch with ML model for neural search
|
|
7
|
+
load - Load and index content from URLs or sitemaps
|
|
8
|
+
search - Search indexed content with multiple modes
|
|
9
|
+
|
|
10
|
+
Example:
|
|
11
|
+
$ gnosisllm-knowledge setup --host localhost --port 9200
|
|
12
|
+
$ gnosisllm-knowledge load https://docs.example.com/sitemap.xml
|
|
13
|
+
$ gnosisllm-knowledge search "how to configure auth"
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from gnosisllm_knowledge.cli.app import app, main
|
|
17
|
+
|
|
18
|
+
__all__ = ["app", "main"]
|