raglineage 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. raglineage-0.1.0/CONTRIBUTING.md +57 -0
  2. raglineage-0.1.0/LICENSE +17 -0
  3. raglineage-0.1.0/MANIFEST.in +7 -0
  4. raglineage-0.1.0/PKG-INFO +336 -0
  5. raglineage-0.1.0/README.md +296 -0
  6. raglineage-0.1.0/examples/data/products.csv +6 -0
  7. raglineage-0.1.0/examples/data/sample.txt +15 -0
  8. raglineage-0.1.0/pyproject.toml +84 -0
  9. raglineage-0.1.0/raglineage/__init__.py +6 -0
  10. raglineage-0.1.0/raglineage/api.py +406 -0
  11. raglineage-0.1.0/raglineage/audit/__init__.py +15 -0
  12. raglineage-0.1.0/raglineage/audit/auditor.py +46 -0
  13. raglineage-0.1.0/raglineage/audit/checks.py +98 -0
  14. raglineage-0.1.0/raglineage/cli/__init__.py +1 -0
  15. raglineage-0.1.0/raglineage/cli/main.py +137 -0
  16. raglineage-0.1.0/raglineage/config.py +22 -0
  17. raglineage-0.1.0/raglineage/embedding/__init__.py +13 -0
  18. raglineage-0.1.0/raglineage/embedding/base.py +41 -0
  19. raglineage-0.1.0/raglineage/embedding/local.py +45 -0
  20. raglineage-0.1.0/raglineage/embedding/openai.py +54 -0
  21. raglineage-0.1.0/raglineage/ingest/__init__.py +8 -0
  22. raglineage-0.1.0/raglineage/ingest/auto.py +49 -0
  23. raglineage-0.1.0/raglineage/ingest/base.py +37 -0
  24. raglineage-0.1.0/raglineage/ingest/files.py +60 -0
  25. raglineage-0.1.0/raglineage/ingest/tabular.py +103 -0
  26. raglineage-0.1.0/raglineage/lineage/__init__.py +7 -0
  27. raglineage-0.1.0/raglineage/lineage/diff.py +99 -0
  28. raglineage-0.1.0/raglineage/lineage/graph.py +167 -0
  29. raglineage-0.1.0/raglineage/lineage/versioning.py +159 -0
  30. raglineage-0.1.0/raglineage/retrieval/__init__.py +6 -0
  31. raglineage-0.1.0/raglineage/retrieval/filters.py +74 -0
  32. raglineage-0.1.0/raglineage/retrieval/retriever.py +88 -0
  33. raglineage-0.1.0/raglineage/schemas/__init__.py +16 -0
  34. raglineage-0.1.0/raglineage/schemas/audit.py +48 -0
  35. raglineage-0.1.0/raglineage/schemas/dataset.py +67 -0
  36. raglineage-0.1.0/raglineage/schemas/lineage_node.py +86 -0
  37. raglineage-0.1.0/raglineage/store/__init__.py +7 -0
  38. raglineage-0.1.0/raglineage/store/base.py +59 -0
  39. raglineage-0.1.0/raglineage/store/faiss_store.py +117 -0
  40. raglineage-0.1.0/raglineage/store/mapping.py +76 -0
  41. raglineage-0.1.0/raglineage/transform/__init__.py +15 -0
  42. raglineage-0.1.0/raglineage/transform/base.py +29 -0
  43. raglineage-0.1.0/raglineage/transform/chunkers.py +173 -0
  44. raglineage-0.1.0/raglineage/transform/dedupe.py +45 -0
  45. raglineage-0.1.0/raglineage/transform/normalize.py +57 -0
  46. raglineage-0.1.0/raglineage/utils/__init__.py +14 -0
  47. raglineage-0.1.0/raglineage/utils/hashing.py +36 -0
  48. raglineage-0.1.0/raglineage/utils/io.py +57 -0
  49. raglineage-0.1.0/raglineage/utils/logging.py +28 -0
  50. raglineage-0.1.0/raglineage.egg-info/PKG-INFO +336 -0
  51. raglineage-0.1.0/raglineage.egg-info/SOURCES.txt +58 -0
  52. raglineage-0.1.0/raglineage.egg-info/dependency_links.txt +1 -0
  53. raglineage-0.1.0/raglineage.egg-info/entry_points.txt +2 -0
  54. raglineage-0.1.0/raglineage.egg-info/requires.txt +16 -0
  55. raglineage-0.1.0/raglineage.egg-info/top_level.txt +1 -0
  56. raglineage-0.1.0/setup.cfg +4 -0
  57. raglineage-0.1.0/tests/test_end_to_end_small_dataset.py +43 -0
  58. raglineage-0.1.0/tests/test_graph_diff.py +87 -0
  59. raglineage-0.1.0/tests/test_incremental_update.py +34 -0
  60. raglineage-0.1.0/tests/test_lineage_node.py +43 -0
@@ -0,0 +1,57 @@
1
+ # Contributing to raglineage
2
+
3
+ Thank you for your interest in contributing to raglineage! This document provides guidelines and instructions for contributing.
4
+
5
+ ## Development Setup
6
+
7
+ 1. Fork the repository
8
+ 2. Clone your fork:
9
+ ```bash
10
+ git clone https://github.com/YOUR_USERNAME/raglineage.git
11
+ cd raglineage
12
+ ```
13
+ 3. Install in development mode:
14
+ ```bash
15
+ pip install -e ".[dev]"
16
+ ```
17
+ 4. Run tests to ensure everything works:
18
+ ```bash
19
+ pytest
20
+ ```
21
+
22
+ ## Code Style
23
+
24
+ - Use **type hints** everywhere (Python ≥ 3.10)
25
+ - Follow **PEP 8** style guidelines
26
+ - Use **ruff** for linting (configuration in `pyproject.toml`)
27
+ - Use **pydantic** models for all data schemas
28
+ - Write **docstrings** for all public functions and classes
29
+
30
+ ## Testing
31
+
32
+ - Write tests for all new features
33
+ - Aim for high test coverage
34
+ - Tests should be in the `tests/` directory
35
+ - Run tests with: `pytest`
36
+
37
+ ## Pull Request Process
38
+
39
+ 1. Create a feature branch from `main`
40
+ 2. Make your changes
41
+ 3. Add tests for new functionality
42
+ 4. Ensure all tests pass: `pytest`
43
+ 5. Run linting: `ruff check .`
44
+ 6. Update documentation if needed
45
+ 7. Submit a pull request with a clear description
46
+
47
+ ## Commit Messages
48
+
49
+ - Use clear, descriptive commit messages
50
+ - Reference issue numbers if applicable
51
+ - Follow the Conventional Commits format when possible (e.g. `fix: handle empty manifest`)
52
+
53
+ ## Questions?
54
+
55
+ Open an issue on GitHub for questions or discussions.
56
+
57
+ Thank you for contributing!
@@ -0,0 +1,17 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Copyright 2026 Pranav Motarwar
6
+
7
+ Licensed under the Apache License, Version 2.0 (the "License");
8
+ you may not use this file except in compliance with the License.
9
+ You may obtain a copy of the License at
10
+
11
+ http://www.apache.org/licenses/LICENSE-2.0
12
+
13
+ Unless required by applicable law or agreed to in writing, software
14
+ distributed under the License is distributed on an "AS IS" BASIS,
15
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ See the License for the specific language governing permissions and
17
+ limitations under the License.
@@ -0,0 +1,7 @@
1
+ include README.md
2
+ include LICENSE
3
+ include CONTRIBUTING.md
4
+ include pyproject.toml
5
+ recursive-include examples *
6
+ recursive-exclude * __pycache__
7
+ recursive-exclude * *.py[co]
@@ -0,0 +1,336 @@
1
+ Metadata-Version: 2.4
2
+ Name: raglineage
3
+ Version: 0.1.0
4
+ Summary: Lineage-aware RAG engine for auditable, reproducible, versioned retrieval and answers
5
+ Author-email: Pranav Motarwar <pranav.motarwar@example.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/PranavMotarwar/raglineage
8
+ Project-URL: Documentation, https://github.com/PranavMotarwar/raglineage
9
+ Project-URL: Repository, https://github.com/PranavMotarwar/raglineage
10
+ Project-URL: Issues, https://github.com/PranavMotarwar/raglineage/issues
11
+ Keywords: rag,lineage,provenance,vector-search,nlp,llm
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: pydantic>=2.0.0
26
+ Requires-Dist: networkx>=3.0
27
+ Requires-Dist: faiss-cpu>=1.7.4
28
+ Requires-Dist: sentence-transformers>=2.2.0
29
+ Requires-Dist: typer>=0.9.0
30
+ Requires-Dist: rich>=13.0.0
31
+ Requires-Dist: pyyaml>=6.0
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
34
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
35
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
36
+ Requires-Dist: mypy>=1.5.0; extra == "dev"
37
+ Provides-Extra: openai
38
+ Requires-Dist: openai>=1.0.0; extra == "openai"
39
+ Dynamic: license-file
40
+
41
+ # raglineage
42
+
43
+ **Lineage-aware RAG engine for auditable, reproducible, versioned retrieval and answers**
44
+
45
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
46
+ [![License: Apache-2.0](https://img.shields.io/badge/License-Apache--2.0-green.svg)](https://opensource.org/licenses/Apache-2.0)
47
+
48
+ ## The Unique Idea
49
+
50
+ Most RAG tools store text chunks and embeddings. They lose provenance and cannot explain answer drift.
51
+
52
+ **raglineage** treats RAG as a data lineage and provenance problem, not just vector search. Every retrievable unit is a **Lineage Node (LN)** with:
53
+
54
+ - Immutable ID and dataset version
55
+ - Precise source reference (file path, page, row, URL, etc.)
56
+ - Full transform chain (ordered list of transforms applied)
57
+ - Content hash for integrity
58
+ - Timestamps for auditing
59
+
60
+ The system maintains a **Lineage Graph (DAG)** linking nodes through structural and semantic relationships, enabling:
61
+
62
+ - Dataset versioning and diffing
63
+ - Incremental rebuilds (only recompute what changed)
64
+ - Answer auditing (reconstruct provenance of any answer)
65
+ - Version consistency checks
66
+ - Staleness detection
67
+
68
+ This is **not** a LangChain/LlamaIndex wrapper—it's a first-class lineage system.
69
+
70
+ ## Architecture
71
+
72
+ ```
73
+ ┌─────────────────────────────────────────────────────────────┐
74
+ │ Data Sources │
75
+ │ (PDFs, CSVs, JSON, APIs, Text Files) │
76
+ └──────────────────────┬──────────────────────────────────────┘
77
+
78
+
79
+ ┌─────────────────────────────────────────────────────────────┐
80
+ │ Ingestion Layer │
81
+ │ AutoIngestor → FileIngestor → TabularIngestor │
82
+ └──────────────────────┬──────────────────────────────────────┘
83
+
84
+
85
+ ┌─────────────────────────────────────────────────────────────┐
86
+ │ Transform Layer │
87
+ │ Chunkers → Dedupe → Normalize │
88
+ │ (Each transform recorded in transform_chain) │
89
+ └──────────────────────┬──────────────────────────────────────┘
90
+
91
+
92
+ ┌─────────────────────────────────────────────────────────────┐
93
+ │ Lineage Node Creation │
94
+ │ ln_id, source, transform_chain, content_hash, version │
95
+ └──────────────────────┬──────────────────────────────────────┘
96
+
97
+
98
+ ┌─────────────────────────────────────────────────────────────┐
99
+ │ Lineage Graph (DAG) │
100
+ │ networkx DAG: nodes=LN, edges=relationships │
101
+ └──────────────────────┬──────────────────────────────────────┘
102
+
103
+
104
+ ┌─────────────────────────────────────────────────────────────┐
105
+ │ Embedding + Vector Store │
106
+ │ Embeddings → FAISS Store → LN ID Mapping │
107
+ └──────────────────────┬──────────────────────────────────────┘
108
+
109
+
110
+ ┌─────────────────────────────────────────────────────────────┐
111
+ │ Retrieval + Audit │
112
+ │ Query → Top-K → Graph Walk → Answer + Lineage │
113
+ │ Audit → Version Check → Staleness → Risk Flags │
114
+ └─────────────────────────────────────────────────────────────┘
115
+ ```
116
+
117
+ ## Lineage Node Example
118
+
119
+ Every retrievable chunk is a Lineage Node with complete provenance:
120
+
121
+ ```json
122
+ {
123
+ "ln_id": "ln_92af",
124
+ "content": "Revenue declined due to supply constraints",
125
+ "source": {
126
+ "type": "pdf",
127
+ "uri": "data/10Q_Q3_2023.pdf",
128
+ "page": 14,
129
+ "section": "Management Discussion"
130
+ },
131
+ "dataset_version": "v3.1",
132
+ "transform_chain": [
133
+ "pdf_parse",
134
+ "section_split",
135
+ "semantic_chunk",
136
+ "deduplicate"
137
+ ],
138
+ "content_hash": "sha256:a3f5b8c9d2e1f4a6b7c8d9e0f1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0",
139
+ "created_at": "2026-01-20T00:00:00Z"
140
+ }
141
+ ```
142
+
143
+ ## Audited Answer Example
144
+
145
+ Every answer includes full lineage and audit metadata:
146
+
147
+ ```json
148
+ {
149
+ "question": "Why did revenue fall in Q3?",
150
+ "answer": "Revenue declined primarily due to supply constraints affecting shipments.",
151
+ "lineage": [
152
+ {
153
+ "ln_id": "ln_92af",
154
+ "score": 0.91,
155
+ "source": {
156
+ "uri": "data/10Q_Q3_2023.pdf",
157
+ "page": 14
158
+ },
159
+ "dataset_version": "v3.1",
160
+ "transform_chain": ["pdf_parse","section_split","semantic_chunk","deduplicate"]
161
+ }
162
+ ],
163
+ "audit": {
164
+ "staleness_check": "pass",
165
+ "version_consistency": "single_version",
166
+ "transform_risk_flags": []
167
+ }
168
+ }
169
+ ```
170
+
171
+ ## Quickstart
172
+
173
+ ### Installation
174
+
175
+ ```bash
176
+ pip install raglineage
177
+ ```
178
+
179
+ ### Basic Usage
180
+
181
+ ```python
182
+ from raglineage import RagLineage
183
+
184
+ rag = RagLineage(
185
+ source="examples/data",
186
+ store_backend="faiss",
187
+ embed_backend="local"
188
+ )
189
+
190
+ # Build initial version
191
+ rag.build(version="v1.0")
192
+
193
+ # Query with lineage
194
+ ans = rag.query("What is the refund policy?", k=5)
195
+ print(ans.model_dump_json(indent=2))
196
+
197
+ # Audit the answer
198
+ report = rag.audit(ans)
199
+ print(report.model_dump_json(indent=2))
200
+ ```
201
+
202
+ ### CLI Usage
203
+
204
+ ```bash
205
+ # Initialize a project
206
+ raglineage init ./my_project
207
+
208
+ # Build from source
209
+ raglineage build --source ./data --version v1.0
210
+
211
+ # Update incrementally
212
+ raglineage update --source ./data --version v1.1 --changed-only
213
+
214
+ # Query
215
+ raglineage query "What is the refund policy?" --k 5
216
+
217
+ # Diff versions
218
+ raglineage diff v1.0 v1.1
219
+ ```
220
+
221
+ ## Comparison with Other RAG Tools
222
+
223
+ | Feature | raglineage | LangChain | LlamaIndex |
224
+ |---------|-----------|-----------|------------|
225
+ | **Lineage Tracking** | ✅ First-class | ❌ Not built-in | ❌ Not built-in |
226
+ | **Dataset Versioning** | ✅ Native | ❌ Manual | ❌ Manual |
227
+ | **Incremental Updates** | ✅ Automatic | ❌ Full rebuild | ❌ Full rebuild |
228
+ | **Answer Auditing** | ✅ Built-in | ❌ Manual | ❌ Manual |
229
+ | **Transform Chain Tracking** | ✅ Every LN | ❌ Not tracked | ❌ Not tracked |
230
+ | **Version Diffing** | ✅ Structured | ❌ Not available | ❌ Not available |
231
+ | **Graph Relationships** | ✅ DAG-based | ⚠️ Optional | ⚠️ Optional |
232
+ | **Source Provenance** | ✅ Complete | ⚠️ Basic | ⚠️ Basic |
233
+
234
+ **Key Difference**: raglineage treats lineage as a core requirement, not an afterthought. Every operation preserves and tracks provenance.
235
+
236
+ ## Core Concepts
237
+
238
+ ### Lineage Nodes (LN)
239
+
240
+ A Lineage Node is the atomic unit of retrieval. Each LN has:
241
+ - **ln_id**: Stable, deterministic identifier
242
+ - **content**: The actual text content
243
+ - **source**: Precise reference to origin (file, page, row, etc.)
244
+ - **dataset_version**: Version tag for the dataset
245
+ - **transform_chain**: Ordered list of transforms applied
246
+ - **content_hash**: SHA-256 hash for integrity
247
+ - **timestamps**: Created/updated timestamps
248
+
249
+ ### Lineage Graph
250
+
251
+ A directed acyclic graph (DAG) where:
252
+ - **Nodes**: Lineage Node IDs
253
+ - **Edges**: Typed relationships (adjacent, semantic, references, same_entity, etc.)
254
+
255
+ The graph enables graph-walk retrieval and relationship exploration.
256
+
257
+ ### Dataset Versioning
258
+
259
+ Each dataset build produces a versioned manifest that:
260
+ - Tracks all source files and their hashes
261
+ - Enables diffing between versions
262
+ - Supports incremental updates (only recompute changed files)
263
+
264
+ ### Answer Auditing
265
+
266
+ Every answer includes:
267
+ - **Lineage**: List of LNs used with scores and metadata
268
+ - **Audit Report**:
269
+ - Version consistency check
270
+ - Staleness detection
271
+ - Transform risk flags
272
+
273
+ ## Project Structure
274
+
275
+ ```
276
+ raglineage/
277
+ ├── raglineage/
278
+ │ ├── __init__.py
279
+ │ ├── api.py # High-level API
280
+ │ ├── config.py # Configuration
281
+ │ ├── schemas/ # Pydantic models
282
+ │ ├── ingest/ # Data ingestion
283
+ │ ├── transform/ # Transformations
284
+ │ ├── lineage/ # Graph & versioning
285
+ │ ├── embedding/ # Embedding backends
286
+ │ ├── store/ # Vector stores
287
+ │ ├── retrieval/ # Retrieval logic
288
+ │ ├── audit/ # Auditing
289
+ │ ├── cli/ # CLI interface
290
+ │ └── utils/ # Utilities
291
+ ├── tests/ # Test suite
292
+ ├── examples/ # Example datasets
293
+ └── pyproject.toml # Package config
294
+ ```
295
+
296
+ ## Requirements
297
+
298
+ - Python ≥ 3.10
299
+ - Strict type hints throughout
300
+ - Pydantic models for schemas
301
+ - NetworkX for graph operations
302
+ - FAISS for vector storage
303
+ - Sentence-transformers for local embeddings
304
+
305
+ ## Development
306
+
307
+ ```bash
308
+ # Clone repository
309
+ git clone https://github.com/PranavMotarwar/raglineage.git
310
+ cd raglineage
311
+
312
+ # Install in development mode
313
+ pip install -e ".[dev]"
314
+
315
+ # Run tests
316
+ pytest
317
+
318
+ # Run linting
319
+ ruff check .
320
+ ```
321
+
322
+ ## Contributing
323
+
324
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
325
+
326
+ ## License
327
+
328
+ Licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE) for details.
329
+
330
+ ## Author
331
+
332
+ Pranav Motarwar - [GitHub](https://github.com/PranavMotarwar)
333
+
334
+ ---
335
+
336
+ **raglineage** - Where every answer has a traceable origin.