ragit 0.8.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff shows changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
ragit/loaders.py CHANGED
@@ -6,15 +6,78 @@
  Document loading and chunking utilities.

  Provides simple functions to load documents from files and chunk text.
+
+ Includes ai4rag-inspired patterns:
+ - Auto-generated document IDs via SHA256 hash
+ - Sequence numbering for chunk ordering
+ - Deduplication via content hashing
  """

+ import hashlib
  import re
  from pathlib import Path
- from typing import Any

  from ragit.core.experiment.experiment import Chunk, Document


+ def generate_document_id(content: str) -> str:
+     """
+     Generate a unique document ID from content using SHA256 hash.
+
+     Pattern from ai4rag langchain_chunker.py.
+
+     Parameters
+     ----------
+     content : str
+         Document content to hash.
+
+     Returns
+     -------
+     str
+         16-character hex string (first 64 bits of SHA256).
+
+     Examples
+     --------
+     >>> doc_id = generate_document_id("Hello, world!")
+     >>> len(doc_id)
+     16
+     """
+     return hashlib.sha256(content.encode()).hexdigest()[:16]
+
+
+ def deduplicate_documents(documents: list[Document]) -> list[Document]:
+     """
+     Remove duplicate documents based on content hash.
+
+     Pattern from ai4rag chroma.py.
+
+     Parameters
+     ----------
+     documents : list[Document]
+         Documents to deduplicate.
+
+     Returns
+     -------
+     list[Document]
+         Unique documents (first occurrence kept).
+
+     Examples
+     --------
+     >>> unique_docs = deduplicate_documents(docs)
+     >>> print(f"Removed {len(docs) - len(unique_docs)} duplicates")
+     """
+     seen_hashes: set[str] = set()
+     unique_docs: list[Document] = []
+
+     for doc in documents:
+         content_hash = generate_document_id(doc.content)
+         if content_hash not in seen_hashes:
+             seen_hashes.add(content_hash)
+             unique_docs.append(doc)
+
+     return unique_docs
+
+
  def load_text(path: str | Path) -> Document:
      """
      Load a single text file as a Document.
@@ -77,11 +140,16 @@ def chunk_text(
      text: str,
      chunk_size: int = 512,
      chunk_overlap: int = 50,
-     doc_id: str = "doc",
-     metadata: dict[str, Any] | None = None,
+     doc_id: str | None = None,
+     include_metadata: bool = True,
  ) -> list[Chunk]:
      """
-     Split text into overlapping chunks.
+     Split text into overlapping chunks with rich metadata.
+
+     Includes ai4rag-inspired metadata:
+     - document_id: SHA256 hash for deduplication and window search
+     - sequence_number: Order within the document
+     - chunk_start/chunk_end: Character positions in original text

      Parameters
      ----------
@@ -91,37 +159,55 @@ def chunk_text(
          Maximum characters per chunk (default: 512).
      chunk_overlap : int
          Overlap between chunks (default: 50).
-     doc_id : str
-         Document ID for the chunks (default: "doc").
-     metadata : dict, optional
-         Metadata to attach to each chunk (default: None).
+     doc_id : str, optional
+         Document ID for the chunks. If None, generates from content hash.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).

      Returns
      -------
      list[Chunk]
-         List of text chunks.
+         List of text chunks with metadata.

      Examples
      --------
-     >>> chunks = chunk_text("Long document...", chunk_size=256, chunk_overlap=50)
+     >>> chunks = chunk_text("Long document...", chunk_size=256)
+     >>> print(chunks[0].metadata)
+     {'document_id': 'a1b2c3...', 'sequence_number': 0, 'chunk_start': 0, 'chunk_end': 256}
      """
      if chunk_overlap >= chunk_size:
          raise ValueError("chunk_overlap must be less than chunk_size")

+     # Generate document ID if not provided
+     effective_doc_id = doc_id or generate_document_id(text)
+
      chunks = []
      start = 0
-     chunk_idx = 0
-     chunk_metadata = metadata or {}
+     sequence_number = 0

      while start < len(text):
-         end = start + chunk_size
+         end = min(start + chunk_size, len(text))
          chunk_content = text[start:end].strip()

          if chunk_content:
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": sequence_number,
+                     "chunk_start": start,
+                     "chunk_end": end,
+                 }
+
              chunks.append(
-                 Chunk(content=chunk_content, doc_id=doc_id, chunk_index=chunk_idx, metadata=chunk_metadata.copy())
+                 Chunk(
+                     content=chunk_content,
+                     doc_id=effective_doc_id,
+                     chunk_index=sequence_number,
+                     metadata=metadata,
+                 )
              )
-             chunk_idx += 1
+             sequence_number += 1

          start = end - chunk_overlap
          if start >= len(text) - chunk_overlap:
@@ -130,9 +216,14 @@ def chunk_text(
      return chunks


- def chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50) -> list[Chunk]:
+ def chunk_document(
+     doc: Document,
+     chunk_size: int = 512,
+     chunk_overlap: int = 50,
+     include_metadata: bool = True,
+ ) -> list[Chunk]:
      """
-     Split a Document into overlapping chunks.
+     Split a Document into overlapping chunks with rich metadata.

      Parameters
      ----------
@@ -142,17 +233,29 @@ def chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50
          Maximum characters per chunk.
      chunk_overlap : int
          Overlap between chunks.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).

      Returns
      -------
      list[Chunk]
-         List of chunks from the document.
+         List of chunks from the document with metadata.
      """
-     return chunk_text(doc.content, chunk_size, chunk_overlap, doc.id, metadata=doc.metadata)
+     chunks = chunk_text(doc.content, chunk_size, chunk_overlap, doc.id, include_metadata)
+
+     # Merge document metadata into chunk metadata
+     if doc.metadata and include_metadata:
+         for chunk in chunks:
+             chunk.metadata = {**doc.metadata, **chunk.metadata}
+
+     return chunks


  def chunk_by_separator(
-     text: str, separator: str = "\n\n", doc_id: str = "doc", metadata: dict[str, Any] | None = None
+     text: str,
+     separator: str = "\n\n",
+     doc_id: str | None = None,
+     include_metadata: bool = True,
  ) -> list[Chunk]:
      """
      Split text by a separator (e.g., paragraphs, sections).
@@ -163,64 +266,96 @@ def chunk_by_separator(
          Text to split.
      separator : str
          Separator string (default: double newline for paragraphs).
-     doc_id : str
-         Document ID for the chunks.
-     metadata : dict, optional
-         Metadata to attach to each chunk (default: None).
+     doc_id : str, optional
+         Document ID for the chunks. If None, generates from content hash.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).

      Returns
      -------
      list[Chunk]
-         List of chunks.
+         List of chunks with metadata.

      Examples
      --------
      >>> chunks = chunk_by_separator(text, separator="\\n---\\n")
      """
+     effective_doc_id = doc_id or generate_document_id(text)
      parts = text.split(separator)
-     chunks = []
-     chunk_metadata = metadata or {}
+     chunks: list[Chunk] = []
+     current_pos = 0

-     for idx, part in enumerate(parts):
+     for _idx, part in enumerate(parts):
          content = part.strip()
          if content:
-             chunks.append(Chunk(content=content, doc_id=doc_id, chunk_index=idx, metadata=chunk_metadata.copy()))
+             metadata = {}
+             if include_metadata:
+                 # Find actual position in original text
+                 part_start = text.find(part, current_pos)
+                 part_end = part_start + len(part) if part_start >= 0 else current_pos + len(part)
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": len(chunks),
+                     "chunk_start": part_start if part_start >= 0 else current_pos,
+                     "chunk_end": part_end,
+                 }
+                 current_pos = part_end
+
+             chunks.append(
+                 Chunk(
+                     content=content,
+                     doc_id=effective_doc_id,
+                     chunk_index=len(chunks),
+                     metadata=metadata,
+                 )
+             )

      return chunks


- def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any] | None = None) -> list[Chunk]:
+ def chunk_rst_sections(
+     text: str,
+     doc_id: str | None = None,
+     include_metadata: bool = True,
+ ) -> list[Chunk]:
      """
-     Split RST document by section headers.
+     Split RST document by section headers with rich metadata.

      Parameters
      ----------
      text : str
          RST document text.
-     doc_id : str
-         Document ID for the chunks.
-     metadata : dict, optional
-         Metadata to attach to each chunk (default: None).
+     doc_id : str, optional
+         Document ID for the chunks. If None, generates from content hash.
+     include_metadata : bool
+         Include rich metadata in chunks (default: True).

      Returns
      -------
      list[Chunk]
-         List of section chunks.
+         List of section chunks with metadata.
      """
+     effective_doc_id = doc_id or generate_document_id(text)
+
      # Match RST section headers (title followed by underline of =, -, ~, etc.)
      pattern = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)\n"
-     chunk_metadata = metadata or {}

      # Find all section positions
      matches = list(re.finditer(pattern, text))

      if not matches:
          # No sections found, return whole text as one chunk
-         return (
-             [Chunk(content=text.strip(), doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy())]
-             if text.strip()
-             else []
-         )
+         if text.strip():
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": 0,
+                     "chunk_start": 0,
+                     "chunk_end": len(text),
+                 }
+             return [Chunk(content=text.strip(), doc_id=effective_doc_id, chunk_index=0, metadata=metadata)]
+         return []

      chunks = []

@@ -229,7 +364,15 @@ def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any]
      if first_pos > 0:
          pre_content = text[:first_pos].strip()
          if pre_content:
-             chunks.append(Chunk(content=pre_content, doc_id=doc_id, chunk_index=0, metadata=chunk_metadata.copy()))
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": 0,
+                     "chunk_start": 0,
+                     "chunk_end": first_pos,
+                 }
+             chunks.append(Chunk(content=pre_content, doc_id=effective_doc_id, chunk_index=0, metadata=metadata))

      # Extract each section
      for i, match in enumerate(matches):
@@ -238,8 +381,21 @@ def chunk_rst_sections(text: str, doc_id: str = "doc", metadata: dict[str, Any]

          section_content = text[start:end].strip()
          if section_content:
+             metadata = {}
+             if include_metadata:
+                 metadata = {
+                     "document_id": effective_doc_id,
+                     "sequence_number": len(chunks),
+                     "chunk_start": start,
+                     "chunk_end": end,
+                 }
              chunks.append(
-                 Chunk(content=section_content, doc_id=doc_id, chunk_index=len(chunks), metadata=chunk_metadata.copy())
+                 Chunk(
+                     content=section_content,
+                     doc_id=effective_doc_id,
+                     chunk_index=len(chunks),
+                     metadata=metadata,
+                 )
              )

      return chunks
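
For orientation, a minimal usage sketch of the reworked chunking API above (not part of the diff; the file names and assertions are illustrative assumptions, and it presumes ragit 0.11.0 is installed):

from ragit.loaders import chunk_text, deduplicate_documents, generate_document_id, load_text

# Hypothetical input files; loading the same file twice yields duplicate content.
docs = [load_text("guide.rst"), load_text("guide.rst"), load_text("notes.txt")]
unique_docs = deduplicate_documents(docs)  # first occurrence per content hash is kept
print(f"Removed {len(docs) - len(unique_docs)} duplicates")

# doc_id is now optional; when omitted it falls back to the SHA256-based content hash.
chunks = chunk_text(unique_docs[0].content, chunk_size=256, chunk_overlap=50)
assert chunks[0].metadata["document_id"] == generate_document_id(unique_docs[0].content)
assert [c.metadata["sequence_number"] for c in chunks] == list(range(len(chunks)))
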
ragit/logging.py ADDED
@@ -0,0 +1,194 @@
+ #
+ # Copyright RODMENA LIMITED 2025
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ """
+ Structured logging for ragit.
+
+ Provides consistent logging across all ragit components with:
+ - Operation timing
+ - Context tracking
+ - Configurable log levels
+ """
+
+ import logging
+ import time
+ from collections.abc import Callable, Generator
+ from contextlib import contextmanager
+ from functools import wraps
+ from typing import Any, TypeVar
+
+ # Create ragit logger
+ logger = logging.getLogger("ragit")
+
+ # Type variable for decorated functions
+ F = TypeVar("F", bound=Callable[..., Any])
+
+
+ def setup_logging(level: str = "INFO", format_string: str | None = None) -> None:
+     """Configure ragit logging.
+
+     Parameters
+     ----------
+     level : str
+         Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL).
+     format_string : str, optional
+         Custom format string. If None, uses default format.
+
+     Examples
+     --------
+     >>> from ragit.logging import setup_logging
+     >>> setup_logging("DEBUG")
+     """
+     logger.setLevel(level.upper())
+
+     # Only add handler if none exist
+     if not logger.handlers:
+         handler = logging.StreamHandler()
+         handler.setLevel(level.upper())
+
+         if format_string is None:
+             format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+         formatter = logging.Formatter(format_string)
+         handler.setFormatter(formatter)
+         logger.addHandler(handler)
+
+
+ @contextmanager
+ def log_operation(operation: str, **context: Any) -> Generator[dict[str, Any], None, None]:
+     """Context manager for logging operations with timing.
+
+     Parameters
+     ----------
+     operation : str
+         Name of the operation being performed.
+     **context
+         Additional context to include in log messages.
+
+     Yields
+     ------
+     dict
+         Mutable dict to add additional context during the operation.
+
+     Examples
+     --------
+     >>> with log_operation("embed", model="nomic-embed-text") as ctx:
+     ...     result = provider.embed(text, model)
+     ...     ctx["dimensions"] = len(result.embedding)
+     """
+     start = time.perf_counter()
+     extra_context: dict[str, Any] = {}
+
+     # Build context string
+     ctx_str = ", ".join(f"{k}={v}" for k, v in context.items()) if context else ""
+
+     logger.debug(f"{operation}.start" + (f" [{ctx_str}]" if ctx_str else ""))
+
+     try:
+         yield extra_context
+         duration_ms = (time.perf_counter() - start) * 1000
+
+         # Combine original context with extra context
+         all_context = {**context, **extra_context, "duration_ms": f"{duration_ms:.2f}"}
+         ctx_str = ", ".join(f"{k}={v}" for k, v in all_context.items())
+
+         logger.info(f"{operation}.success [{ctx_str}]")
+     except Exception as e:
+         duration_ms = (time.perf_counter() - start) * 1000
+         all_context = {**context, **extra_context, "duration_ms": f"{duration_ms:.2f}", "error": str(e)}
+         ctx_str = ", ".join(f"{k}={v}" for k, v in all_context.items())
+
+         logger.error(f"{operation}.failed [{ctx_str}]", exc_info=True)
+         raise
+
+
+ def log_method(operation: str) -> Callable[[F], F]:
+     """Decorator for logging method calls with timing.
+
+     Parameters
+     ----------
+     operation : str
+         Name of the operation for logging.
+
+     Returns
+     -------
+     Callable
+         Decorated function.
+
+     Examples
+     --------
+     >>> class MyProvider:
+     ...     @log_method("embed")
+     ...     def embed(self, text: str, model: str):
+     ...         ...
+     """
+
+     def decorator(func: F) -> F:
+         @wraps(func)
+         def wrapper(*args: Any, **kwargs: Any) -> Any:
+             with log_operation(operation, method=func.__name__):
+                 return func(*args, **kwargs)
+
+         return wrapper  # type: ignore
+
+     return decorator
+
+
+ class LogContext:
+     """Context tracker for correlating related log messages.
+
+     Useful for tracing operations across multiple components.
+
+     Examples
+     --------
+     >>> ctx = LogContext("query-123")
+     >>> ctx.log("Starting retrieval", top_k=5)
+     >>> ctx.log("Retrieved chunks", count=3)
+     """
+
+     def __init__(self, request_id: str | None = None):
+         """Initialize log context.
+
+         Parameters
+         ----------
+         request_id : str, optional
+             Unique identifier for this context. Auto-generated if not provided.
+         """
+         self.request_id = request_id or f"req-{int(time.time() * 1000) % 100000}"
+         self._start_time = time.perf_counter()
+
+     def log(self, message: str, level: str = "INFO", **context: Any) -> None:
+         """Log a message with this context.
+
+         Parameters
+         ----------
+         message : str
+             Log message.
+         level : str
+             Log level (DEBUG, INFO, WARNING, ERROR).
+         **context
+             Additional context key-value pairs.
+         """
+         elapsed_ms = (time.perf_counter() - self._start_time) * 1000
+         ctx_str = ", ".join(f"{k}={v}" for k, v in context.items())
+         full_msg = f"[{self.request_id}] {message}" + (f" [{ctx_str}]" if ctx_str else "") + f" (+{elapsed_ms:.0f}ms)"
+
+         log_level = getattr(logging, level.upper(), logging.INFO)
+         logger.log(log_level, full_msg)
+
+     def debug(self, message: str, **context: Any) -> None:
+         """Log debug message."""
+         self.log(message, "DEBUG", **context)
+
+     def info(self, message: str, **context: Any) -> None:
+         """Log info message."""
+         self.log(message, "INFO", **context)
+
+     def warning(self, message: str, **context: Any) -> None:
+         """Log warning message."""
+         self.log(message, "WARNING", **context)
+
+     def error(self, message: str, **context: Any) -> None:
+         """Log error message."""
+         self.log(message, "ERROR", **context)
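
The new logging module is self-contained; the following minimal sketch (not part of the diff — the operation names, file name, and Retriever class are illustrative assumptions) shows the three entry points working together:

from ragit.logging import LogContext, log_method, log_operation, setup_logging

setup_logging("DEBUG")  # attaches a single StreamHandler to the "ragit" logger

# Timed block; extra context added inside the block appears in the success/failure line.
with log_operation("chunk", path="guide.rst") as ctx:
    ctx["n_chunks"] = 12

# Decorator form wraps a method call in the same log_operation machinery.
class Retriever:
    @log_method("retrieve")
    def retrieve(self, query: str) -> list[str]:
        return []

Retriever().retrieve("what is ragit?")

# Correlated messages share a request id and carry an elapsed-time suffix.
trace = LogContext("query-123")
trace.info("Starting retrieval", top_k=5)
trace.debug("Retrieved chunks", count=3)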