gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,9 @@
1
+ """Text chunking strategies."""
2
+
3
+ from gnosisllm_knowledge.chunking.fixed import FixedSizeChunker
4
+ from gnosisllm_knowledge.chunking.sentence import SentenceChunker
5
+
6
+ __all__ = [
7
+ "SentenceChunker",
8
+ "FixedSizeChunker",
9
+ ]
@@ -0,0 +1,138 @@
1
+ """Fixed-size text chunker."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from gnosisllm_knowledge.core.domain.document import TextChunk
8
+
9
# Default chunking parameters
DEFAULT_CHUNK_SIZE = 4000  # Characters
DEFAULT_CHUNK_OVERLAP = 200  # Characters


class FixedSizeChunker:
    """Simple fixed-size text chunker.

    This chunker splits text into fixed-size chunks without regard for
    semantic boundaries. It's faster than sentence-aware chunking but
    may split words or sentences in the middle.

    For better results with natural language, use SentenceChunker.

    Example:
        ```python
        chunker = FixedSizeChunker(chunk_size=4000, chunk_overlap=200)
        chunks = chunker.chunk(long_text)
        ```

    Attributes:
        chunk_size: Target chunk size in characters.
        chunk_overlap: Overlap between consecutive chunks.
    """

    def __init__(
        self,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    ) -> None:
        """Initialize the chunker.

        Args:
            chunk_size: Target chunk size in characters. Must be positive.
            chunk_overlap: Overlap between consecutive chunks. Must be
                non-negative and strictly smaller than ``chunk_size``.

        Raises:
            ValueError: If the size/overlap combination cannot produce a
                forward-moving, gap-free sliding window.
        """
        self._validate_window(chunk_size, chunk_overlap)

        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap

    @staticmethod
    def _validate_window(chunk_size: int, chunk_overlap: int) -> None:
        """Reject size/overlap pairs that would break the sliding window.

        Args:
            chunk_size: Window length in characters.
            chunk_overlap: Characters shared by consecutive windows.

        Raises:
            ValueError: If ``chunk_overlap >= chunk_size`` (the window would
                not advance), if ``chunk_size`` is not positive, or if
                ``chunk_overlap`` is negative (the step would exceed the
                window and text between windows would be skipped).
        """
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must not be negative")

    @property
    def name(self) -> str:
        """Return the chunker name."""
        return "fixed"

    @property
    def chunk_size(self) -> int:
        """Return the target chunk size."""
        return self._chunk_size

    @property
    def chunk_overlap(self) -> int:
        """Return the chunk overlap."""
        return self._chunk_overlap

    def chunk(self, text: str, **options: Any) -> list[TextChunk]:
        """Split text into fixed-size chunks.

        The text is stripped of leading/trailing whitespace first, so the
        ``start_position``/``end_position`` of every chunk are offsets into
        the stripped text, not into the string that was passed in.

        Args:
            text: The text to chunk.
            **options: Override options:
                - chunk_size: Override default chunk size
                - chunk_overlap: Override default overlap

        Returns:
            List of TextChunk objects. Empty for empty or whitespace-only
            input.

        Raises:
            ValueError: If the text has to be split and the effective
                size/overlap pair is invalid (see ``__init__``).
        """
        # Get options with fallbacks
        chunk_size = options.get("chunk_size", self._chunk_size)
        chunk_overlap = options.get("chunk_overlap", self._chunk_overlap)

        if not text:
            return []

        text = text.strip()
        if not text:
            # Whitespace-only input carries no content worth indexing.
            return []

        text_len = len(text)
        if text_len <= chunk_size:
            return [
                TextChunk(
                    content=text,
                    index=0,
                    start_position=0,
                    end_position=text_len,
                )
            ]

        # Per-call overrides bypass __init__, so re-check them before they
        # are turned into a range() step: a zero step raises an unrelated
        # error and a negative step would silently yield no chunks at all.
        self._validate_window(chunk_size, chunk_overlap)

        chunks: list[TextChunk] = []
        step = chunk_size - chunk_overlap

        for start_pos in range(0, text_len, step):
            end_pos = min(start_pos + chunk_size, text_len)
            chunk_content = text[start_pos:end_pos]

            # Windows that contain only whitespace are skipped; indices
            # stay contiguous because they follow the emitted count.
            if chunk_content.strip():
                chunks.append(
                    TextChunk(
                        content=chunk_content,
                        index=len(chunks),
                        start_position=start_pos,
                        end_position=end_pos,
                    )
                )

            # Stop once a window reaches the end; later windows would only
            # repeat the overlap region.
            if end_pos >= text_len:
                break

        return chunks

    def estimate_chunks(self, text: str) -> int:
        """Estimate the number of chunks that would be created.

        Counts the sliding windows ``chunk()`` would visit using the
        instance's configured size and overlap. The result is an upper
        bound: windows consisting solely of whitespace are skipped by
        ``chunk()`` but still counted here.

        Args:
            text: The text to estimate.

        Returns:
            Estimated number of chunks (0 for empty or whitespace-only text).
        """
        if not text:
            return 0

        # chunk() strips before measuring, so measure the same thing here.
        text_len = len(text.strip())
        if text_len == 0:
            return 0
        if text_len <= self._chunk_size:
            return 1

        # One full first window, then one more window per `step` characters
        # (rounded up) of the remaining text.
        step = self._chunk_size - self._chunk_overlap
        remaining = text_len - self._chunk_size
        return 1 + (remaining + step - 1) // step
@@ -0,0 +1,239 @@
1
+ """Sentence-aware text chunker."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+ from gnosisllm_knowledge.core.domain.document import TextChunk
9
+
10
# Default chunking parameters
DEFAULT_CHUNK_SIZE = 8000  # Characters (fits ~2000 tokens)
DEFAULT_CHUNK_OVERLAP = 200  # Characters
DEFAULT_MIN_CHUNK_SIZE = 100  # Characters

# Sentence terminators followed by whitespace, or a blank-line paragraph
# break. Compiled once because it is applied to every document.
_SENTENCE_BOUNDARY_RE = re.compile(r"[.!?]+\s+|\n{2,}")


class SentenceChunker:
    """Text chunker that respects sentence boundaries.

    This chunker splits text into chunks while trying to keep sentences
    intact. It finds sentence boundaries and creates chunks of approximately
    the target size without breaking sentences in the middle. When no
    suitable boundary exists in the second half of a window, the window is
    cut at the target size instead.

    Example:
        ```python
        chunker = SentenceChunker(chunk_size=4000, chunk_overlap=200)
        chunks = chunker.chunk(long_text)
        for chunk in chunks:
            print(f"Chunk {chunk.index}: {chunk.length} chars")
        ```

    Attributes:
        chunk_size: Target chunk size in characters.
        chunk_overlap: Overlap between consecutive chunks.
        min_chunk_size: Minimum chunk size to create.
    """

    def __init__(
        self,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
        min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE,
    ) -> None:
        """Initialize the chunker.

        Args:
            chunk_size: Target chunk size in characters. Must be positive.
            chunk_overlap: Overlap between consecutive chunks.
            min_chunk_size: Minimum chunk size to create.

        Raises:
            ValueError: If ``chunk_size`` is not positive; such a window can
                never advance through the text.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")

        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._min_chunk_size = min_chunk_size

    @property
    def name(self) -> str:
        """Return the chunker name."""
        return "sentence"

    @property
    def chunk_size(self) -> int:
        """Return the target chunk size."""
        return self._chunk_size

    @property
    def chunk_overlap(self) -> int:
        """Return the chunk overlap."""
        return self._chunk_overlap

    def chunk(self, text: str, **options: Any) -> list[TextChunk]:
        """Split text into chunks respecting sentence boundaries.

        The text is cleaned first (see ``_clean_text``), so the
        ``start_position``/``end_position`` of every chunk are offsets into
        the cleaned text, not into the string that was passed in. Chunk
        content is additionally stripped, so it may be slightly shorter
        than ``end_position - start_position``.

        Args:
            text: The text to chunk.
            **options: Override options:
                - chunk_size: Override default chunk size
                - chunk_overlap: Override default overlap
                - min_chunk_size: Override minimum size

        Returns:
            List of TextChunk objects. Empty for empty or whitespace-only
            input.

        Raises:
            ValueError: If the text has to be split and the effective
                ``chunk_size`` is not positive.
        """
        # Get options with fallbacks
        chunk_size = options.get("chunk_size", self._chunk_size)
        chunk_overlap = options.get("chunk_overlap", self._chunk_overlap)
        min_chunk_size = options.get("min_chunk_size", self._min_chunk_size)

        if not text:
            return []

        # Clean and normalize text
        text = self._clean_text(text)
        if not text:
            return []

        text_len = len(text)
        if text_len < min_chunk_size:
            # Too short to split, but still worth keeping as a single chunk.
            return [
                TextChunk(
                    content=text,
                    index=0,
                    start_position=0,
                    end_position=text_len,
                )
            ]

        # A per-call override bypasses __init__; a non-positive window would
        # keep the loop below from ever advancing.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")

        # Find sentence boundaries
        boundaries = self._find_sentence_boundaries(text)

        chunks: list[TextChunk] = []
        start_pos = 0

        while start_pos < text_len:
            # Tentative end of this chunk: a full window, capped at the end.
            end_pos = min(start_pos + chunk_size, text_len)

            # If we're not at the end, pull back to a sentence boundary.
            if end_pos < text_len:
                best_boundary = self._find_best_boundary(
                    boundaries, start_pos, end_pos, chunk_size
                )
                if best_boundary:
                    end_pos = best_boundary

            chunk_content = text[start_pos:end_pos].strip()
            is_last = end_pos >= text_len

            # The final remainder always holds text no earlier chunk covers,
            # so keep it even when it is shorter than min_chunk_size.
            keep = len(chunk_content) >= min_chunk_size or (
                is_last and bool(chunk_content)
            )
            if keep:
                chunks.append(
                    TextChunk(
                        content=chunk_content,
                        index=len(chunks),
                        start_position=start_pos,
                        end_position=end_pos,
                    )
                )

            if is_last:
                break

            # Step back by the overlap, but always advance by at least one
            # character and never jump past the end of the current chunk.
            next_start = max(start_pos + 1, end_pos - chunk_overlap)
            start_pos = min(next_start, end_pos)

        return chunks

    def estimate_chunks(self, text: str) -> int:
        """Estimate the number of chunks that would be created.

        This is a rough figure based on the raw text length and the
        instance's configured size and overlap; it does not look for
        sentence boundaries.

        Args:
            text: The text to estimate.

        Returns:
            Estimated number of chunks.
        """
        if not text:
            return 0

        text_len = len(text)
        if text_len <= self._chunk_size:
            return 1

        # Account for overlap; fall back to the plain chunk size when the
        # overlap is at least as large as the chunk itself.
        effective_chunk_size = self._chunk_size - self._chunk_overlap
        if effective_chunk_size <= 0:
            effective_chunk_size = self._chunk_size

        return max(1, (text_len + effective_chunk_size - 1) // effective_chunk_size)

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Drops characters that are neither printable nor whitespace, then
        collapses every run of whitespace (including newlines and tabs) to a
        single space and strips the ends.

        Args:
            text: Raw text.

        Returns:
            Cleaned text.
        """
        # Remove non-printables first so that dropping one between two
        # spaces cannot leave a double space behind.
        text = "".join(c for c in text if c.isprintable() or c.isspace())
        # Normalize whitespace
        return re.sub(r"\s+", " ", text).strip()

    def _find_sentence_boundaries(self, text: str) -> list[int]:
        """Find sentence boundary positions in text.

        A boundary is the position just after one or more of ``.``, ``!`` or
        ``?`` and the whitespace that follows them, or just after a run of
        two or more newlines.

        Args:
            text: The text to analyze.

        Returns:
            List of positions after sentence endings, in ascending order.
        """
        return [match.end() for match in _SENTENCE_BOUNDARY_RE.finditer(text)]

    def _find_best_boundary(
        self,
        boundaries: list[int],
        start_pos: int,
        end_pos: int,
        chunk_size: int,
    ) -> int | None:
        """Find the best sentence boundary for chunking.

        Tries to find a boundary close to end_pos but not too close
        to start_pos.

        Args:
            boundaries: List of sentence boundaries, in ascending order.
            start_pos: Chunk start position.
            end_pos: Desired end position.
            chunk_size: Target chunk size.

        Returns:
            The last boundary that lies after ``start_pos``, at or before
            ``end_pos``, and at least half a chunk past ``start_pos``; or
            None if none found.
        """
        min_pos = start_pos + (chunk_size // 2)  # Don't split too early
        best = None

        for boundary in boundaries:
            if boundary <= start_pos:
                continue
            if boundary > end_pos:
                # Boundaries are ascending, so nothing further can qualify.
                break
            if boundary >= min_pos:
                best = boundary

        return best
@@ -0,0 +1,18 @@
1
+ """GnosisLLM Knowledge CLI.
2
+
3
+ Enterprise-grade command-line interface for knowledge management.
4
+
5
+ Commands:
6
+ setup - Configure OpenSearch with ML model for neural search
7
+ load - Load and index content from URLs or sitemaps
8
+ search - Search indexed content with multiple modes
9
+
10
+ Example:
11
+ $ gnosisllm-knowledge setup --host localhost --port 9200
12
+ $ gnosisllm-knowledge load https://docs.example.com/sitemap.xml
13
+ $ gnosisllm-knowledge search "how to configure auth"
14
+ """
15
+
16
+ from gnosisllm_knowledge.cli.app import app, main
17
+
18
+ __all__ = ["app", "main"]