gnosisllm_knowledge-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
gnosisllm_knowledge/cli/utils/config.py
@@ -0,0 +1,207 @@
+"""CLI configuration provider."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+
+@dataclass
+class CliConfig:
+    """CLI configuration loaded from environment.
+
+    Provides a unified interface for accessing configuration
+    with sensible defaults for CLI operations.
+    """
+
+    # OpenSearch
+    opensearch_host: str = "localhost"
+    opensearch_port: int = 9200
+    opensearch_username: str | None = None
+    opensearch_password: str | None = None
+    opensearch_use_ssl: bool = False
+    opensearch_verify_certs: bool = False
+    opensearch_model_id: str | None = None
+    opensearch_index_name: str = "knowledge"
+    opensearch_pipeline_name: str = "gnosisllm-ingest-pipeline"
+    opensearch_search_pipeline_name: str = "gnosisllm-search-pipeline"
+
+    # OpenAI
+    openai_api_key: str | None = None
+    openai_embedding_model: str = "text-embedding-ada-002"
+    openai_embedding_dimension: int = 1536
+
+    # Agentic Search
+    opensearch_flow_agent_id: str | None = None
+    opensearch_conversational_agent_id: str | None = None
+    agentic_llm_model: str = "gpt-4o"
+    agentic_max_iterations: int = 5
+    agentic_timeout_seconds: int = 60
+
+    # Neoreader
+    neoreader_host: str = "https://api.neoreader.dev"
+
+    @classmethod
+    def from_env(cls, env_file: str | Path | None = None) -> CliConfig:
+        """Load configuration from environment.
+
+        Args:
+            env_file: Optional path to .env file.
+
+        Returns:
+            Configured CliConfig instance.
+        """
+        # Load .env file if it exists
+        if env_file:
+            load_dotenv(env_file)
+        else:
+            load_dotenv()
+
+        return cls(
+            opensearch_host=os.getenv("OPENSEARCH_HOST", "localhost"),
+            opensearch_port=int(os.getenv("OPENSEARCH_PORT", "9200")),
+            opensearch_username=os.getenv("OPENSEARCH_USERNAME"),
+            opensearch_password=os.getenv("OPENSEARCH_PASSWORD"),
+            opensearch_use_ssl=os.getenv("OPENSEARCH_USE_SSL", "false").lower() == "true",
+            opensearch_verify_certs=os.getenv("OPENSEARCH_VERIFY_CERTS", "false").lower()
+            == "true",
+            opensearch_model_id=os.getenv("OPENSEARCH_MODEL_ID"),
+            opensearch_index_name=os.getenv("OPENSEARCH_INDEX_NAME", "knowledge"),
+            opensearch_pipeline_name=os.getenv(
+                "OPENSEARCH_PIPELINE_NAME", "gnosisllm-ingest-pipeline"
+            ),
+            opensearch_search_pipeline_name=os.getenv(
+                "OPENSEARCH_SEARCH_PIPELINE_NAME", "gnosisllm-search-pipeline"
+            ),
+            openai_api_key=os.getenv("OPENAI_API_KEY"),
+            openai_embedding_model=os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"),
+            openai_embedding_dimension=int(os.getenv("OPENAI_EMBEDDING_DIMENSION", "1536")),
+            # Agentic search configuration
+            opensearch_flow_agent_id=os.getenv("OPENSEARCH_FLOW_AGENT_ID"),
+            opensearch_conversational_agent_id=os.getenv("OPENSEARCH_CONVERSATIONAL_AGENT_ID"),
+            agentic_llm_model=os.getenv("AGENTIC_LLM_MODEL", "gpt-4o"),
+            agentic_max_iterations=int(os.getenv("AGENTIC_MAX_ITERATIONS", "5")),
+            agentic_timeout_seconds=int(os.getenv("AGENTIC_TIMEOUT_SECONDS", "60")),
+            neoreader_host=os.getenv("NEOREADER_HOST", "https://api.neoreader.dev"),
+        )
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """Get configuration value by key.
+
+        Args:
+            key: Configuration key (e.g., "OPENSEARCH_HOST").
+            default: Default value if not found.
+
+        Returns:
+            Configuration value or default.
+        """
+        # Convert env-style key to attribute name
+        attr_name = key.lower()
+        return getattr(self, attr_name, default)
+
+    def require(self, key: str) -> str:
+        """Get required configuration value.
+
+        Args:
+            key: Configuration key.
+
+        Returns:
+            Configuration value.
+
+        Raises:
+            ValueError: If value is not set.
+        """
+        value = self.get(key)
+        if not value:
+            raise ValueError(f"{key} is required but not set")
+        return str(value)
+
+    @property
+    def opensearch_url(self) -> str:
+        """Get OpenSearch URL."""
+        protocol = "https" if self.opensearch_use_ssl else "http"
+        return f"{protocol}://{self.opensearch_host}:{self.opensearch_port}"
+
+    def validate_for_setup(self) -> list[str]:
+        """Validate configuration for setup command.
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = []
+        if not self.openai_api_key:
+            errors.append("OPENAI_API_KEY is required for setup")
+        return errors
+
+    def validate_for_search(self) -> list[str]:
+        """Validate configuration for search command.
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = []
+        if not self.opensearch_model_id:
+            errors.append(
+                "OPENSEARCH_MODEL_ID is required for semantic/hybrid search. "
+                "Run 'gnosisllm-knowledge setup' first."
+            )
+        return errors
+
+    def validate_for_agentic_search(self, agent_type: str = "flow") -> list[str]:
+        """Validate configuration for agentic search.
+
+        Args:
+            agent_type: Type of agent ('flow' or 'conversational').
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = self.validate_for_search()
+
+        if agent_type == "flow" and not self.opensearch_flow_agent_id:
+            errors.append(
+                "OPENSEARCH_FLOW_AGENT_ID is required for flow agent search. "
+                "Run 'gnosisllm-knowledge agentic setup' first."
+            )
+        elif agent_type == "conversational" and not self.opensearch_conversational_agent_id:
+            errors.append(
+                "OPENSEARCH_CONVERSATIONAL_AGENT_ID is required for conversational agent search. "
+                "Run 'gnosisllm-knowledge agentic setup' first."
+            )
+
+        return errors
+
+    def validate_for_agentic_setup(self) -> list[str]:
+        """Validate configuration for agentic setup command.
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = self.validate_for_setup()
+
+        if not self.opensearch_model_id:
+            errors.append(
+                "OPENSEARCH_MODEL_ID is required for agentic setup. "
+                "Run 'gnosisllm-knowledge setup' first to deploy the embedding model."
+            )
+
+        return errors
+
+    @property
+    def has_agentic_agents(self) -> bool:
+        """Check if any agentic agent is configured."""
+        return bool(self.opensearch_flow_agent_id or self.opensearch_conversational_agent_id)
+
+    @property
+    def has_flow_agent(self) -> bool:
+        """Check if flow agent is configured."""
+        return bool(self.opensearch_flow_agent_id)
+
+    @property
+    def has_conversational_agent(self) -> bool:
+        """Check if conversational agent is configured."""
+        return bool(self.opensearch_conversational_agent_id)
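
Usage sketch (annotation, not part of the packaged diff): CliConfig is loaded once from the environment and then validated per command. The snippet below uses only methods defined in the file above (from_env, validate_for_search, require, and the opensearch_url property); the ".env" path and the exit-on-error handling are illustrative assumptions.

from gnosisllm_knowledge.cli.utils.config import CliConfig

# Load settings from a .env file (falls back to load_dotenv() discovery when omitted).
config = CliConfig.from_env(".env")

# Each command validates only what it needs; errors come back as readable strings.
errors = config.validate_for_search()
if errors:
    raise SystemExit("\n".join(errors))

print(config.opensearch_url)                    # e.g. http://localhost:9200
print(config.require("OPENSEARCH_INDEX_NAME"))  # raises ValueError if unset or empty
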
gnosisllm_knowledge/core/__init__.py
@@ -0,0 +1,87 @@
+"""Core module - Foundation layer with domain models, interfaces, and events."""
+
+from gnosisllm_knowledge.core.domain import (
+    AgenticSearchQuery,
+    AgenticSearchResult,
+    AgentType,
+    BatchResult,
+    Document,
+    DocumentStatus,
+    IndexResult,
+    LoadResult,
+    ReasoningStep,
+    SearchMode,
+    SearchQuery,
+    SearchResult,
+    SearchResultItem,
+    SourceConfig,
+    TextChunk,
+    ValidationResult,
+)
+from gnosisllm_knowledge.core.events import Event, EventEmitter, EventType
+from gnosisllm_knowledge.core.exceptions import (
+    AuthenticationError,
+    AuthorizationError,
+    ConfigurationError,
+    ConnectionError,
+    EmbeddingError,
+    IndexError,
+    KnowledgeError,
+    LoadError,
+    SearchError,
+    SetupError,
+    TimeoutError,
+    ValidationError,
+)
+from gnosisllm_knowledge.core.interfaces import (
+    IAgenticSearcher,
+    IContentFetcher,
+    IContentLoader,
+    IDocumentIndexer,
+    IKnowledgeSearcher,
+    ITextChunker,
+)
+
+__all__ = [
+    # Domain models
+    "Document",
+    "DocumentStatus",
+    "TextChunk",
+    "LoadResult",
+    "IndexResult",
+    "BatchResult",
+    "ValidationResult",
+    "SearchQuery",
+    "SearchResult",
+    "SearchResultItem",
+    "SearchMode",
+    "AgenticSearchQuery",
+    "AgenticSearchResult",
+    "AgentType",
+    "ReasoningStep",
+    "SourceConfig",
+    # Events
+    "Event",
+    "EventType",
+    "EventEmitter",
+    # Exceptions
+    "KnowledgeError",
+    "ConfigurationError",
+    "ConnectionError",
+    "AuthenticationError",
+    "AuthorizationError",
+    "LoadError",
+    "ValidationError",
+    "IndexError",
+    "SearchError",
+    "EmbeddingError",
+    "SetupError",
+    "TimeoutError",
+    # Interfaces
+    "IContentLoader",
+    "IContentFetcher",
+    "ITextChunker",
+    "IDocumentIndexer",
+    "IKnowledgeSearcher",
+    "IAgenticSearcher",
+]
gnosisllm_knowledge/core/domain/__init__.py
@@ -0,0 +1,43 @@
+"""Domain models - Value objects and entities."""
+
+from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus, TextChunk
+from gnosisllm_knowledge.core.domain.result import (
+    BatchResult,
+    IndexResult,
+    LoadResult,
+    ValidationResult,
+)
+from gnosisllm_knowledge.core.domain.search import (
+    AgenticSearchQuery,
+    AgenticSearchResult,
+    AgentType,
+    ReasoningStep,
+    SearchMode,
+    SearchQuery,
+    SearchResult,
+    SearchResultItem,
+)
+from gnosisllm_knowledge.core.domain.source import SourceConfig
+
+__all__ = [
+    # Document
+    "Document",
+    "DocumentStatus",
+    "TextChunk",
+    # Result
+    "LoadResult",
+    "IndexResult",
+    "BatchResult",
+    "ValidationResult",
+    # Search
+    "SearchQuery",
+    "SearchResult",
+    "SearchResultItem",
+    "SearchMode",
+    "AgenticSearchQuery",
+    "AgenticSearchResult",
+    "AgentType",
+    "ReasoningStep",
+    # Source
+    "SourceConfig",
+]
gnosisllm_knowledge/core/domain/document.py
@@ -0,0 +1,240 @@
+"""Document domain models."""
+
+from __future__ import annotations
+
+import hashlib
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from enum import Enum
+from typing import Any
+
+
+class DocumentStatus(Enum):
+    """Document processing status."""
+
+    PENDING = "pending"
+    PROCESSING = "processing"
+    INDEXED = "indexed"
+    FAILED = "failed"
+    DELETED = "deleted"
+
+
+@dataclass
+class Document:
+    """Represents a document to be indexed.
+
+    This is the core domain object that flows through the knowledge pipeline.
+    Documents are created by loaders, processed by chunkers, and stored by indexers.
+
+    Attributes:
+        content: The main text content of the document.
+        source: Source identifier (URL, file path, etc.).
+        doc_id: Unique identifier. Auto-generated from content hash if not provided.
+        title: Optional document title.
+        url: URL where the document was fetched from.
+        metadata: Arbitrary metadata dictionary.
+
+    Multi-tenancy fields:
+        account_id: Account/tenant identifier.
+        collection_id: Collection the document belongs to.
+        source_id: Source identifier within the collection.
+
+    Chunking info:
+        chunk_index: Index of this chunk (0-based).
+        total_chunks: Total number of chunks for the parent document.
+        parent_doc_id: Reference to the original document ID.
+
+    Quality and validation:
+        quality_score: Quality score from 0.0 to 1.0.
+        language: Detected language code (ISO 639-1).
+        content_hash: SHA-256 hash for deduplication.
+        word_count: Number of words in content.
+
+    Status:
+        status: Current processing status.
+
+    PII handling:
+        pii_detected: Whether PII was detected.
+        pii_redacted: Whether PII was redacted.
+
+    Timestamps:
+        created_at: When the document was created.
+        updated_at: When the document was last updated.
+        indexed_at: When the document was indexed.
+    """
+
+    content: str
+    source: str
+    doc_id: str | None = None
+    title: str | None = None
+    url: str | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    # Multi-tenancy fields
+    account_id: str | None = None
+    collection_id: str | None = None
+    source_id: str | None = None
+
+    # Chunking info
+    chunk_index: int | None = None
+    total_chunks: int | None = None
+    parent_doc_id: str | None = None
+
+    # Quality and validation
+    quality_score: float | None = None
+    language: str | None = None
+    content_hash: str | None = None
+    word_count: int | None = None
+
+    # Status
+    status: DocumentStatus = DocumentStatus.PENDING
+
+    # PII handling
+    pii_detected: bool = False
+    pii_redacted: bool = False
+
+    # Timestamps
+    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+    updated_at: datetime | None = None
+    indexed_at: datetime | None = None
+
+    def __post_init__(self) -> None:
+        """Generate doc_id and content_hash if not provided."""
+        if not self.content:
+            raise ValueError("Document content cannot be empty")
+
+        # Generate content hash for deduplication
+        if self.content_hash is None:
+            self.content_hash = hashlib.sha256(self.content.encode()).hexdigest()
+
+        # Generate doc_id from content hash if not provided
+        if self.doc_id is None:
+            self.doc_id = f"{self.source}#{self.content_hash[:16]}"
+
+        # Calculate word count
+        if self.word_count is None:
+            self.word_count = len(self.content.split())
+
+    def with_chunk_info(
+        self,
+        chunk_index: int,
+        total_chunks: int,
+        parent_doc_id: str | None = None,
+    ) -> Document:
+        """Create a new document with chunk information.
+
+        Args:
+            chunk_index: Index of this chunk (0-based).
+            total_chunks: Total number of chunks.
+            parent_doc_id: Reference to the original document ID.
+
+        Returns:
+            New Document instance with chunk information set.
+        """
+        return Document(
+            content=self.content,
+            source=self.source,
+            doc_id=None,  # Will be regenerated
+            title=self.title,
+            url=self.url,
+            metadata=self.metadata.copy(),
+            account_id=self.account_id,
+            collection_id=self.collection_id,
+            source_id=self.source_id,
+            chunk_index=chunk_index,
+            total_chunks=total_chunks,
+            parent_doc_id=parent_doc_id or self.doc_id,
+            quality_score=self.quality_score,
+            language=self.language,
+            status=self.status,
+            pii_detected=self.pii_detected,
+            pii_redacted=self.pii_redacted,
+            created_at=self.created_at,
+        )
+
+    def with_tenant(
+        self,
+        account_id: str,
+        collection_id: str | None = None,
+        source_id: str | None = None,
+    ) -> Document:
+        """Create a new document with tenant information.
+
+        Args:
+            account_id: Account/tenant identifier.
+            collection_id: Collection identifier.
+            source_id: Source identifier.
+
+        Returns:
+            New Document instance with tenant information set.
+        """
+        return Document(
+            content=self.content,
+            source=self.source,
+            doc_id=self.doc_id,
+            title=self.title,
+            url=self.url,
+            metadata=self.metadata.copy(),
+            account_id=account_id,
+            collection_id=collection_id or self.collection_id,
+            source_id=source_id or self.source_id,
+            chunk_index=self.chunk_index,
+            total_chunks=self.total_chunks,
+            parent_doc_id=self.parent_doc_id,
+            quality_score=self.quality_score,
+            language=self.language,
+            content_hash=self.content_hash,
+            word_count=self.word_count,
+            status=self.status,
+            pii_detected=self.pii_detected,
+            pii_redacted=self.pii_redacted,
+            created_at=self.created_at,
+            updated_at=self.updated_at,
+            indexed_at=self.indexed_at,
+        )
+
+    @property
+    def is_chunk(self) -> bool:
+        """Check if this document is a chunk of a larger document."""
+        return self.chunk_index is not None and self.total_chunks is not None
+
+    @property
+    def is_multi_tenant(self) -> bool:
+        """Check if this document has tenant information."""
+        return self.account_id is not None
+
+
+@dataclass
+class TextChunk:
+    """Represents a chunk of text from a document.
+
+    Text chunks are created by chunkers to split large documents into
+    smaller, embedding-friendly pieces.
+
+    Attributes:
+        content: The text content of the chunk.
+        index: Index of this chunk (0-based).
+        start_position: Start position in the original text.
+        end_position: End position in the original text.
+        metadata: Optional metadata for the chunk.
+    """
+
+    content: str
+    index: int
+    start_position: int
+    end_position: int
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def length(self) -> int:
+        """Return the length of the chunk content."""
+        return len(self.content)
+
+    def __post_init__(self) -> None:
+        """Validate chunk data."""
+        if self.start_position < 0:
+            raise ValueError("start_position must be non-negative")
+        if self.end_position < self.start_position:
+            raise ValueError("end_position must be >= start_position")
+        if self.index < 0:
+            raise ValueError("index must be non-negative")