contextiq 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 ContextIQ Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,132 @@
1
+ Metadata-Version: 2.4
2
+ Name: contextiq
3
+ Version: 0.1.0
4
+ Summary: Turn messy files into agent-ready context.
5
+ Author: ContextIQ Contributors
6
+ License-Expression: MIT
7
+ Keywords: rag,agents,llm,ingestion,chunking,search,context
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Software Development :: Libraries
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Text Processing
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Provides-Extra: docs
23
+ Requires-Dist: python-docx>=1.1.0; extra == "docs"
24
+ Requires-Dist: pypdf>=5.0.0; extra == "docs"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
27
+ Dynamic: license-file
28
+
29
+ # ContextIQ
30
+
31
+ ContextIQ turns messy files into agent-ready context.
32
+
33
+ It is a local-first ingestion pipeline for developers building RAG systems, agent memory layers, document search, and eval datasets. Point it at a folder and it produces clean JSONL and Markdown exports with chunked, traceable content.
34
+
35
+ ## Why it exists
36
+
37
+ Most AI tooling starts after your data is already clean. Real projects get stuck much earlier:
38
+
39
+ - PDFs are noisy
40
+ - Word docs lose structure
41
+ - repos and notes mix formats
42
+ - chunks are inconsistent
43
+ - source traceability is easy to lose
44
+
45
+ ContextIQ focuses on the missing middle: consistent ingestion, chunking, and export.
46
+
47
+ ## Features
48
+
49
+ - Local-first CLI
50
+ - Recursive file ingestion
51
+ - Built-in support for:
52
+ - `.txt`, `.md`, `.rst`
53
+ - `.json`, `.jsonl`
54
+ - `.csv`, `.tsv`
55
+ - `.html`, `.htm`
56
+ - optional `.pdf` via `pypdf`
57
+ - optional `.docx` via `python-docx`
58
+ - Document-aware chunking
59
+ - Source-preserving metadata
60
+ - JSONL and Markdown exports
61
+ - Run manifest with counts, warnings, and timings
62
+
63
+ ## Quickstart
64
+
65
+ ```bash
66
+ python -m venv .venv
67
+ . .venv/bin/activate
68
+ pip install -e .[dev]
69
+ contextiq ingest ./examples --out ./build/context
70
+ ```
71
+
72
+ On Windows PowerShell:
73
+
74
+ ```powershell
75
+ python -m venv .venv
76
+ .venv\Scripts\Activate.ps1
77
+ pip install -e .[dev]
78
+ contextiq ingest .\examples --out .\build\context
79
+ ```
80
+
81
+ ## CLI
82
+
83
+ ```bash
84
+ contextiq ingest <path> --out <directory>
85
+ ```
86
+
87
+ Useful flags:
88
+
89
+ - `--include-ext .md,.txt,.json`
90
+ - `--exclude-glob "*.min.js,*.lock"`
91
+ - `--chunk-size 1200`
92
+ - `--chunk-overlap 150`
93
+ - `--formats jsonl,markdown`
94
+ - `--fail-on-warning`
95
+
96
+ ## Output
97
+
98
+ `contextiq ingest` writes:
99
+
100
+ - `documents.jsonl`: normalized source documents
101
+ - `chunks.jsonl`: chunked outputs for RAG/agents
102
+ - `chunks.md`: human-readable review file
103
+ - `manifest.json`: summary of the run
104
+
105
+ Each chunk preserves:
106
+
107
+ - source path
108
+ - document id
109
+ - chunk id
110
+ - byte and character ranges when available
111
+ - headings / section hints
112
+
113
+ ## Example
114
+
115
+ ```bash
116
+ contextiq ingest ./docs --out ./dist/context --chunk-size 900 --chunk-overlap 120
117
+ ```
118
+
119
+ ## Development
120
+
121
+ ```bash
122
+ pip install -e .[dev]
123
+ pytest
124
+ ```
125
+
126
+ ## Roadmap
127
+
128
+ - embeddings plugin interface
129
+ - vector DB exporters
130
+ - OCR pipeline
131
+ - table extraction
132
+ - citation-aware retrieval benchmarks
@@ -0,0 +1,104 @@
1
+ # ContextIQ
2
+
3
+ ContextIQ turns messy files into agent-ready context.
4
+
5
+ It is a local-first ingestion pipeline for developers building RAG systems, agent memory layers, document search, and eval datasets. Point it at a folder and it produces clean JSONL and Markdown exports with chunked, traceable content.
6
+
7
+ ## Why it exists
8
+
9
+ Most AI tooling starts after your data is already clean. Real projects get stuck much earlier:
10
+
11
+ - PDFs are noisy
12
+ - Word docs lose structure
13
+ - repos and notes mix formats
14
+ - chunks are inconsistent
15
+ - source traceability is easy to lose
16
+
17
+ ContextIQ focuses on the missing middle: consistent ingestion, chunking, and export.
18
+
19
+ ## Features
20
+
21
+ - Local-first CLI
22
+ - Recursive file ingestion
23
+ - Built-in support for:
24
+ - `.txt`, `.md`, `.rst`
25
+ - `.json`, `.jsonl`
26
+ - `.csv`, `.tsv`
27
+ - `.html`, `.htm`
28
+ - optional `.pdf` via `pypdf`
29
+ - optional `.docx` via `python-docx`
30
+ - Document-aware chunking
31
+ - Source-preserving metadata
32
+ - JSONL and Markdown exports
33
+ - Run manifest with counts, warnings, the run configuration, and a generation timestamp
34
+
35
+ ## Quickstart
36
+
37
+ ```bash
38
+ python -m venv .venv
39
+ . .venv/bin/activate
40
+ pip install -e .[dev]
41
+ contextiq ingest ./examples --out ./build/context
42
+ ```
43
+
44
+ On Windows PowerShell:
45
+
46
+ ```powershell
47
+ python -m venv .venv
48
+ .venv\Scripts\Activate.ps1
49
+ pip install -e .[dev]
50
+ contextiq ingest .\examples --out .\build\context
51
+ ```
52
+
53
+ ## CLI
54
+
55
+ ```bash
56
+ contextiq ingest <path> --out <directory>
57
+ ```
58
+
59
+ Useful flags:
60
+
61
+ - `--include-ext .md,.txt,.json`
62
+ - `--exclude-glob "*.min.js,*.lock"`
63
+ - `--chunk-size 1200`
64
+ - `--chunk-overlap 150`
65
+ - `--formats jsonl,markdown`
66
+ - `--fail-on-warning`
67
+
68
+ ## Output
69
+
70
+ `contextiq ingest` writes:
71
+
72
+ - `documents.jsonl`: normalized source documents
73
+ - `chunks.jsonl`: chunked outputs for RAG/agents
74
+ - `chunks.md`: human-readable review file
75
+ - `manifest.json`: summary of the run
76
+
77
+ Each chunk preserves:
78
+
79
+ - source path
80
+ - document id
81
+ - chunk id
82
+ - character ranges (start and end offsets into the normalized text)
83
+ - headings / section hints
84
+
85
+ ## Example
86
+
87
+ ```bash
88
+ contextiq ingest ./docs --out ./dist/context --chunk-size 900 --chunk-overlap 120
89
+ ```
90
+
91
+ ## Development
92
+
93
+ ```bash
94
+ pip install -e .[dev]
95
+ pytest
96
+ ```
97
+
98
+ ## Roadmap
99
+
100
+ - embeddings plugin interface
101
+ - vector DB exporters
102
+ - OCR pipeline
103
+ - table extraction
104
+ - citation-aware retrieval benchmarks
@@ -0,0 +1,48 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "contextiq"
7
+ version = "0.1.0"
8
+ description = "Turn messy files into agent-ready context."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ license-files = ["LICENSE"]
13
+ authors = [
14
+ { name = "ContextIQ Contributors" }
15
+ ]
16
+ keywords = ["rag", "agents", "llm", "ingestion", "chunking", "search", "context"]
17
+ classifiers = [
18
+ "Development Status :: 3 - Alpha",
19
+ "Intended Audience :: Developers",
20
+ "Operating System :: OS Independent",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Python :: 3.13",
26
+ "Topic :: Software Development :: Libraries",
27
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
28
+ "Topic :: Text Processing",
29
+ ]
30
+ dependencies = []
31
+
32
+ [project.optional-dependencies]
33
+ docs = ["python-docx>=1.1.0", "pypdf>=5.0.0"]
34
+ dev = ["pytest>=8.0.0"]
35
+
36
+ [project.scripts]
37
+ contextiq = "contextiq.cli:main"
38
+
39
+ [tool.setuptools]
40
+ package-dir = {"" = "src"}
41
+
42
+ [tool.setuptools.packages.find]
43
+ where = ["src"]
44
+
45
+ [tool.pytest.ini_options]
46
+ testpaths = ["tests"]
47
+ addopts = "--basetemp=./build/.pytest-tmp"
48
+ cache_dir = "build/.pytest-cache"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,5 @@
1
+ """ContextIQ package."""
2
+
3
+ __all__ = ["__version__"]
4
+
5
+ __version__ = "0.1.0"
@@ -0,0 +1,5 @@
1
+ from .cli import main
2
+
3
+
4
+ if __name__ == "__main__":
5
+ raise SystemExit(main())
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+
3
+ from .models import Chunk, Document
4
+ from .utils import stable_id
5
+
6
+
7
def chunk_document(document: Document, chunk_size: int, chunk_overlap: int) -> list[Chunk]:
    """Split a document's text into overlapping, boundary-aware chunks.

    Each window covers at most ``chunk_size`` characters. When a window stops
    short of the end of the text, its end is pulled back to the last paragraph
    break (blank line) or, failing that, the last sentence break (period
    followed by a space), provided the break lies far enough into the window.

    Args:
        document: Source document; ``text``, ``doc_id``, ``source_path``,
            ``source_type`` and ``sections`` are read.
        chunk_size: Maximum window length in characters; must be positive.
        chunk_overlap: Characters shared between consecutive windows; must not
            be negative.

    Returns:
        Chunks in document order. ``start_char``/``end_char`` describe the
        window before surrounding whitespace was stripped from ``text``.
        Windows that contain only whitespace produce no chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            negative.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if chunk_overlap < 0:
        raise ValueError(f"chunk_overlap must not be negative, got {chunk_overlap}")
    if not document.text:
        return []

    text = document.text
    chunks: list[Chunk] = []
    start = 0
    index = 0

    while start < len(text):
        end = min(len(text), start + chunk_size)
        if end < len(text):
            # Prefer ending on a paragraph break, then on a sentence break,
            # but only if that keeps the chunk reasonably long.
            candidate = text.rfind("\n\n", start, end)
            if candidate > start + max(200, chunk_size // 3):
                end = candidate
            else:
                sentence_break = text.rfind(". ", start, end)
                if sentence_break > start + max(120, chunk_size // 4):
                    end = sentence_break + 1

        chunk_text = text[start:end].strip()
        if chunk_text:
            chunks.append(
                Chunk(
                    chunk_id=stable_id(document.doc_id, str(index), str(start), str(end)),
                    doc_id=document.doc_id,
                    source_path=document.source_path,
                    text=chunk_text,
                    start_char=start,
                    end_char=end,
                    section_title=_resolve_section_title(document, start, end),
                    metadata={"source_type": document.source_type},
                )
            )

        if end >= len(text):
            break

        # Step back by the overlap. If the overlap would swallow the whole
        # window (no forward progress), continue from the window's end instead
        # of crawling forward one character at a time.
        next_start = end - chunk_overlap
        if next_start <= start:
            next_start = end
        start = next_start
        index += 1

    return chunks
52
+
53
+
54
+ def _resolve_section_title(document: Document, start: int, end: int) -> str | None:
55
+ for section in document.sections:
56
+ if section.start_char <= start < section.end_char:
57
+ return section.title
58
+ if start <= section.start_char < end:
59
+ return section.title
60
+ return None
@@ -0,0 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from .exporters import export_result
8
+ from .pipeline import default_config, run_ingest
9
+
10
+
11
def build_parser() -> argparse.ArgumentParser:
    """Build the ``contextiq`` argument parser with its ``ingest`` sub-command."""
    root_parser = argparse.ArgumentParser(
        prog="contextiq",
        description="Turn messy files into agent-ready context.",
    )
    commands = root_parser.add_subparsers(dest="command", required=True)
    ingest_cmd = commands.add_parser("ingest", help="Ingest a directory of files.")

    ingest_cmd.add_argument("path", help="Root directory to ingest")

    # Optional flags, declared as data so each option reads as one row.
    option_specs = (
        ("--out", {"required": True, "help": "Output directory"}),
        ("--include-ext", {"help": "Comma-separated file extensions to include"}),
        ("--exclude-glob", {"help": "Comma-separated glob patterns to exclude"}),
        ("--chunk-size", {"type": int, "default": 1200, "help": "Target chunk size in characters"}),
        ("--chunk-overlap", {"type": int, "default": 150, "help": "Chunk overlap in characters"}),
        ("--formats", {"default": "jsonl,markdown", "help": "Comma-separated output formats"}),
        ("--fail-on-warning", {"action": "store_true", "help": "Exit non-zero if warnings occur"}),
    )
    for flag, options in option_specs:
        ingest_cmd.add_argument(flag, **options)

    return root_parser
25
+
26
+
27
def main(argv: list[str] | None = None) -> int:
    """Run the ``contextiq`` command line interface.

    Args:
        argv: Argument list without the program name; ``None`` lets argparse
            read ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when ingest fails (including warnings under
        ``--fail-on-warning``), ``2`` when the source path is not a directory.
    """
    args = build_parser().parse_args(argv)
    if args.command != "ingest":
        return 1

    root = Path(args.path).resolve()
    out_dir = Path(args.out).resolve()
    if not root.is_dir():
        # Walking a missing path or a plain file yields nothing, which would
        # otherwise look like a successful run over zero documents.
        print(f"error: {root} is not a directory", file=sys.stderr)
        return 2

    config = default_config(root, out_dir)

    if args.include_ext:
        config.include_extensions = {normalize_extension(ext) for ext in args.include_ext.split(",") if ext.strip()}
    if args.exclude_glob:
        config.exclude_globs = [item.strip() for item in args.exclude_glob.split(",") if item.strip()]
    config.chunk_size = args.chunk_size
    config.chunk_overlap = args.chunk_overlap
    config.formats = {item.strip().lower() for item in args.formats.split(",") if item.strip()}
    config.fail_on_warning = args.fail_on_warning

    try:
        result = run_ingest(config)
    except ValueError as exc:
        # run_ingest signals --fail-on-warning (and invalid input) with
        # ValueError; report it as a non-zero exit instead of a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    export_result(out_dir, result, root, config.formats, config.to_manifest_config())

    print(f"Ingested {len(result.documents)} documents into {len(result.chunks)} chunks.")
    if result.warnings:
        print(f"Warnings: {len(result.warnings)}")
    print(f"Output: {out_dir}")
    return 0
53
+
54
+
55
def normalize_extension(extension: str) -> str:
    """Lower-case and trim an extension, adding a leading dot when it is missing.

    Blank input is returned as an empty string.
    """
    cleaned = extension.strip().lower()
    if cleaned and not cleaned.startswith("."):
        return f".{cleaned}"
    return cleaned
60
+
61
+
62
+ if __name__ == "__main__":
63
+ raise SystemExit(main(sys.argv[1:]))
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Iterable
6
+
7
+
8
+ DEFAULT_EXTENSIONS = {
9
+ ".txt",
10
+ ".md",
11
+ ".rst",
12
+ ".json",
13
+ ".jsonl",
14
+ ".csv",
15
+ ".tsv",
16
+ ".html",
17
+ ".htm",
18
+ ".pdf",
19
+ ".docx",
20
+ }
21
+
22
+ DEFAULT_EXCLUDES = {
23
+ ".git",
24
+ ".hg",
25
+ ".svn",
26
+ ".venv",
27
+ "venv",
28
+ "node_modules",
29
+ "__pycache__",
30
+ "dist",
31
+ "build",
32
+ }
33
+
34
+
35
+ @dataclass(slots=True)
36
+ class DiscoveryConfig:
37
+ include_extensions: set[str]
38
+ exclude_globs: list[str]
39
+
40
+
41
+ def iter_files(root: Path, config: DiscoveryConfig) -> Iterable[Path]:
42
+ for path in root.rglob("*"):
43
+ if not path.is_file():
44
+ continue
45
+ if any(part in DEFAULT_EXCLUDES for part in path.parts):
46
+ continue
47
+ if config.include_extensions and path.suffix.lower() not in config.include_extensions:
48
+ continue
49
+ if any(path.match(pattern) for pattern in config.exclude_globs):
50
+ continue
51
+ yield path
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ import datetime as dt
4
+ from pathlib import Path
5
+
6
+ from .models import Chunk, IngestResult
7
+ from .utils import ensure_dir, write_json, write_jsonl
8
+
9
+
10
def export_result(
    out_dir: Path,
    result: IngestResult,
    source_root: Path,
    formats: set[str],
    config: dict,
) -> None:
    """Write the ingest result and a run manifest into ``out_dir``.

    ``documents.jsonl``, ``chunks.jsonl`` and ``manifest.json`` are always
    written; ``chunks.md`` is written only when ``"markdown"`` is in
    ``formats``.

    Args:
        out_dir: Destination directory; created when missing.
        result: Documents, chunks and warnings to export.
        source_root: Ingested root, recorded in the manifest.
        formats: Requested output formats.
        config: JSON-serializable run configuration stored in the manifest.
    """
    ensure_dir(out_dir)
    write_jsonl(out_dir / "documents.jsonl", (doc.to_dict() for doc in result.documents))
    write_jsonl(out_dir / "chunks.jsonl", (chunk.to_dict() for chunk in result.chunks))

    if "markdown" in formats:
        _write_markdown(out_dir / "chunks.md", result.chunks)

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop tzinfo so the value still serializes as "<iso>Z".
    generated_at = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
    manifest = {
        "generated_at": generated_at.isoformat() + "Z",
        "source_root": str(source_root),
        "summary": result.summary(),
        "warnings": result.warnings,
        "config": config,
    }
    write_json(out_dir / "manifest.json", manifest)
32
+
33
+
34
+ def _write_markdown(path: Path, chunks: list[Chunk]) -> None:
35
+ lines = ["# ContextIQ Chunk Export", ""]
36
+ for chunk in chunks:
37
+ lines.append(f"## {chunk.chunk_id}")
38
+ lines.append("")
39
+ lines.append(f"- Source: `{chunk.source_path}`")
40
+ lines.append(f"- Document: `{chunk.doc_id}`")
41
+ lines.append(f"- Range: `{chunk.start_char}:{chunk.end_char}`")
42
+ if chunk.section_title:
43
+ lines.append(f"- Section: `{chunk.section_title}`")
44
+ lines.append("")
45
+ lines.append(chunk.text)
46
+ lines.append("")
47
+ path.write_text("\n".join(lines), encoding="utf-8")
@@ -0,0 +1,138 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import json
5
+ from html.parser import HTMLParser
6
+ from pathlib import Path
7
+
8
+ from .models import Document, Section
9
+ from .utils import normalize_text, stable_id
10
+
11
+
12
class PlainTextExtractor(HTMLParser):
    """Collect the visible text of an HTML document, one fragment per line.

    The contents of ``<script>`` and ``<style>`` elements are code rather
    than document text and are left out.
    """

    _SKIPPED_TAGS = frozenset({"script", "style"})

    def __init__(self) -> None:
        super().__init__()
        self.parts: list[str] = []
        # Non-zero while the parser is inside a skipped element.
        self._skip_depth = 0

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag in self._SKIPPED_TAGS:
            self._skip_depth += 1

    def handle_endtag(self, tag: str) -> None:
        # Guard against a stray closing tag driving the counter negative.
        if tag in self._SKIPPED_TAGS and self._skip_depth:
            self._skip_depth -= 1

    def handle_data(self, data: str) -> None:
        if self._skip_depth:
            return
        fragment = data.strip()
        if fragment:
            self.parts.append(fragment)

    def get_text(self) -> str:
        """Return the collected fragments joined by newlines."""
        return "\n".join(self.parts)
23
+
24
+
25
def extract_sections(text: str) -> list[Section]:
    """Locate Markdown ATX headings in ``text`` and return the sections they open.

    A heading is a line whose stripped form starts with one to six ``#``
    characters followed by whitespace or the end of the line. Lines such as
    ``#hashtag``, ``#!/bin/sh`` or ``#include`` are therefore not headings.
    Each section runs from the start of its heading line to the start of the
    next heading, or to the end of the text for the last one.

    Args:
        text: Document text to scan.

    Returns:
        Sections in document order; empty when no heading is found. A heading
        with no title text is named ``"Section"``.
    """
    headings: list[tuple[str, int, int]] = []
    cursor = 0
    # keepends=True keeps offsets exact whatever the width of the terminator.
    for line in text.splitlines(keepends=True):
        stripped = line.strip()
        if stripped.startswith("#"):
            level = len(stripped) - len(stripped.lstrip("#"))
            remainder = stripped[level:]
            if level <= 6 and (not remainder or remainder[0].isspace()):
                headings.append((remainder.strip() or "Section", level, cursor))
        cursor += len(line)

    sections: list[Section] = []
    for index, (title, level, start) in enumerate(headings):
        end = headings[index + 1][2] if index + 1 < len(headings) else len(text)
        sections.append(Section(title=title, level=level, start_char=start, end_char=end))
    return sections
43
+
44
+
45
def load_document(path: Path, root: Path) -> Document:
    """Read ``path`` and return it as a normalized ``Document``.

    The loader is chosen by the lower-cased file suffix. Content that cannot
    be decoded or parsed (malformed JSON, broken CSV) is recorded as a warning
    on the returned document, which then has empty text, so that one bad file
    does not abort a whole ingest run.

    Args:
        path: File to load; must be located under ``root``.
        root: Ingest root, used to derive the relative source path and the
            document id.

    Returns:
        The document with normalized text, detected sections and any warnings.

    Raises:
        ValueError: If the file suffix is not supported.
    """
    suffix = path.suffix.lower()
    rel_path = path.relative_to(root).as_posix()
    warnings: list[str] = []

    supported = {
        ".txt", ".md", ".rst", ".json", ".jsonl", ".csv", ".tsv",
        ".html", ".htm", ".pdf", ".docx",
    }
    if suffix not in supported:
        raise ValueError(f"Unsupported file type: {path}")
    source_type = "html" if suffix in {".html", ".htm"} else suffix[1:]

    text = ""
    try:
        if suffix in {".txt", ".md", ".rst"}:
            # utf-8-sig drops a leading byte-order mark, which would otherwise
            # stay in the text and hide a heading on the first line.
            text = path.read_text(encoding="utf-8-sig", errors="ignore")
        elif suffix in {".json", ".jsonl"}:
            text = _load_json_like(path, suffix)
        elif suffix in {".csv", ".tsv"}:
            text = _load_delimited(path, "\t" if suffix == ".tsv" else ",")
        elif suffix in {".html", ".htm"}:
            text = _load_html(path)
        else:
            loader = _load_pdf if suffix == ".pdf" else _load_docx
            text, warning = loader(path)
            if warning:
                warnings.append(warning)
    except (ValueError, csv.Error) as exc:
        # json.JSONDecodeError and UnicodeError are ValueError subclasses.
        warnings.append(f"Skipped content because the file could not be parsed: {exc}")

    normalized = normalize_text(text)
    return Document(
        doc_id=stable_id(rel_path),
        source_path=rel_path,
        source_type=source_type,
        text=normalized,
        metadata={"extension": suffix},
        sections=extract_sections(normalized),
        warnings=warnings,
    )
85
+
86
+
87
+ def _load_json_like(path: Path, suffix: str) -> str:
88
+ if suffix == ".json":
89
+ data = json.loads(path.read_text(encoding="utf-8", errors="ignore"))
90
+ return json.dumps(data, indent=2, ensure_ascii=False)
91
+
92
+ rows = []
93
+ with path.open("r", encoding="utf-8", errors="ignore") as handle:
94
+ for line in handle:
95
+ stripped = line.strip()
96
+ if stripped:
97
+ rows.append(json.loads(stripped))
98
+ return "\n\n".join(json.dumps(row, indent=2, ensure_ascii=False) for row in rows)
99
+
100
+
101
+ def _load_delimited(path: Path, delimiter: str) -> str:
102
+ with path.open("r", encoding="utf-8", errors="ignore", newline="") as handle:
103
+ reader = csv.reader(handle, delimiter=delimiter)
104
+ lines = [" | ".join(cell.strip() for cell in row) for row in reader]
105
+ return "\n".join(lines)
106
+
107
+
108
def _load_html(path: Path) -> str:
    """Return the text content of an HTML file, one text fragment per line."""
    parser = PlainTextExtractor()
    # utf-8-sig drops a leading byte-order mark so it is not collected as a
    # text fragment of its own.
    parser.feed(path.read_text(encoding="utf-8-sig", errors="ignore"))
    # close() makes the parser process input it is still buffering, e.g.
    # trailing text that ends in an unterminated "&".
    parser.close()
    return parser.get_text()
112
+
113
+
114
def _load_pdf(path: Path) -> tuple[str, str | None]:
    """Extract text from a PDF with the optional ``pypdf`` dependency.

    Returns:
        ``(text, None)`` with the page texts joined by blank lines, or
        ``("", warning)`` when ``pypdf`` cannot be imported.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        notice = "Skipped PDF content because 'pypdf' is not installed. Install with: pip install contextiq[docs]"
        return "", notice

    page_texts: list[str] = []
    for page in PdfReader(str(path)).pages:
        # extract_text() may yield nothing for a page; keep an empty slot.
        page_texts.append(page.extract_text() or "")
    return "\n\n".join(page_texts), None
126
+
127
+
128
def _load_docx(path: Path) -> tuple[str, str | None]:
    """Extract paragraph text from a DOCX with the optional ``python-docx`` dependency.

    Returns:
        ``(text, None)`` with one paragraph per line, or ``("", warning)``
        when ``docx`` cannot be imported.
    """
    try:
        from docx import Document as WordDocument
    except ImportError:
        notice = "Skipped DOCX content because 'python-docx' is not installed. Install with: pip install contextiq[docs]"
        return "", notice

    paragraphs = WordDocument(str(path)).paragraphs
    return "\n".join(item.text for item in paragraphs), None
@@ -0,0 +1,59 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import asdict, dataclass, field
4
+ from typing import Any
5
+
6
+
7
@dataclass(slots=True)
class Section:
    """A titled span of a document's text."""

    # Heading text of the section.
    title: str
    # Heading depth.
    level: int
    # Character offset in the document text where the section starts.
    start_char: int
    # Character offset in the document text where the section ends.
    end_char: int
13
+
14
+
15
+ @dataclass(slots=True)
16
+ class Document:
17
+ doc_id: str
18
+ source_path: str
19
+ source_type: str
20
+ text: str
21
+ metadata: dict[str, Any] = field(default_factory=dict)
22
+ sections: list[Section] = field(default_factory=list)
23
+ warnings: list[str] = field(default_factory=list)
24
+
25
+ def to_dict(self) -> dict[str, Any]:
26
+ payload = asdict(self)
27
+ payload["char_count"] = len(self.text)
28
+ return payload
29
+
30
+
31
+ @dataclass(slots=True)
32
+ class Chunk:
33
+ chunk_id: str
34
+ doc_id: str
35
+ source_path: str
36
+ text: str
37
+ start_char: int
38
+ end_char: int
39
+ section_title: str | None
40
+ metadata: dict[str, Any] = field(default_factory=dict)
41
+
42
+ def to_dict(self) -> dict[str, Any]:
43
+ payload = asdict(self)
44
+ payload["char_count"] = len(self.text)
45
+ return payload
46
+
47
+
48
+ @dataclass(slots=True)
49
+ class IngestResult:
50
+ documents: list[Document]
51
+ chunks: list[Chunk]
52
+ warnings: list[str]
53
+
54
+ def summary(self) -> dict[str, Any]:
55
+ return {
56
+ "document_count": len(self.documents),
57
+ "chunk_count": len(self.chunks),
58
+ "warning_count": len(self.warnings),
59
+ }
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ from .chunking import chunk_document
7
+ from .discovery import DEFAULT_EXTENSIONS, DiscoveryConfig, iter_files
8
+ from .loaders import load_document
9
+ from .models import IngestResult
10
+
11
+
12
+ @dataclass(slots=True)
13
+ class IngestConfig:
14
+ root: Path
15
+ output_dir: Path
16
+ include_extensions: set[str]
17
+ exclude_globs: list[str]
18
+ chunk_size: int
19
+ chunk_overlap: int
20
+ formats: set[str]
21
+ fail_on_warning: bool = False
22
+
23
+ def to_manifest_config(self) -> dict:
24
+ return {
25
+ "include_extensions": sorted(self.include_extensions),
26
+ "exclude_globs": self.exclude_globs,
27
+ "chunk_size": self.chunk_size,
28
+ "chunk_overlap": self.chunk_overlap,
29
+ "formats": sorted(self.formats),
30
+ "fail_on_warning": self.fail_on_warning,
31
+ }
32
+
33
+
34
def default_config(root: Path, output_dir: Path) -> IngestConfig:
    """Return an ``IngestConfig`` for ``root``/``output_dir`` with the stock settings.

    The stock settings are: every default extension, no exclude globs,
    1200-character chunks with 150 characters of overlap, both output formats,
    and warnings that do not fail the run.
    """
    stock_settings = dict(
        include_extensions=set(DEFAULT_EXTENSIONS),
        exclude_globs=[],
        chunk_size=1200,
        chunk_overlap=150,
        formats={"jsonl", "markdown"},
        fail_on_warning=False,
    )
    return IngestConfig(root=root, output_dir=output_dir, **stock_settings)
45
+
46
+
47
def run_ingest(config: IngestConfig) -> IngestResult:
    """Discover, load and chunk every matching file under ``config.root``.

    Files that a previous export wrote directly into ``config.output_dir`` are
    skipped, so pointing the output at a folder inside the source root does
    not feed old results back into the next run.

    Args:
        config: Source root, output directory, discovery filters and chunking
            parameters.

    Returns:
        All loaded documents (including those without text), their chunks and
        the warnings collected on the way, each prefixed with its source path.

    Raises:
        ValueError: If ``config.fail_on_warning`` is set and any warning was
            recorded.
    """
    discovery = DiscoveryConfig(
        include_extensions={ext.lower() for ext in config.include_extensions},
        exclude_globs=config.exclude_globs,
    )
    # File names written by the exporter into the output directory.
    export_names = {"documents.jsonl", "chunks.jsonl", "chunks.md", "manifest.json"}

    documents = []
    chunks = []
    warnings: list[str] = []

    for path in iter_files(config.root, discovery):
        if path.parent == config.output_dir and path.name in export_names:
            continue

        document = load_document(path, config.root)
        documents.append(document)
        warnings.extend(f"{document.source_path}: {warning}" for warning in document.warnings)
        # Documents without text are still reported but contribute no chunks.
        if document.text:
            chunks.extend(chunk_document(document, config.chunk_size, config.chunk_overlap))

    if config.fail_on_warning and warnings:
        raise ValueError("Warnings encountered during ingest:\n" + "\n".join(warnings))

    return IngestResult(documents=documents, chunks=chunks, warnings=warnings)
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+ from typing import Iterable
8
+
9
+
10
+ WHITESPACE_RE = re.compile(r"[ \t]+")
11
+ BLANKLINE_RE = re.compile(r"\n{3,}")
12
+
13
+
14
def stable_id(*parts: str) -> str:
    """Return a deterministic 16-hex-character identifier for ``parts``.

    The parts are joined with ``"::"`` and hashed with SHA-1; the first 16
    hexadecimal digits of the digest are returned.
    """
    joined = "::".join(parts)
    # The digest is only an identifier, not a security control; saying so
    # keeps this working on builds that restrict SHA-1 for security use.
    digest = hashlib.sha1(joined.encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()[:16]
17
+
18
+
19
def normalize_text(text: str) -> str:
    """Normalize line endings, collapse whitespace runs and trim ``text``.

    Carriage returns become newlines, runs of spaces/tabs inside a line become
    one space, trailing whitespace is removed from each line, three or more
    consecutive newlines shrink to two, and the result is stripped.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    cleaned_lines: list[str] = []
    for raw_line in unified.splitlines():
        cleaned_lines.append(WHITESPACE_RE.sub(" ", raw_line).rstrip())
    collapsed = BLANKLINE_RE.sub("\n\n", "\n".join(cleaned_lines))
    return collapsed.strip()
23
+
24
+
25
def ensure_dir(path: Path) -> None:
    """Create the directory ``path`` with any missing parents; an existing directory is not an error."""
    path.mkdir(parents=True, exist_ok=True)
27
+
28
+
29
def write_json(path: Path, data: object) -> None:
    """Write ``data`` to ``path`` as indented, ASCII-escaped JSON ending in a newline."""
    serialized = json.dumps(data, indent=2, ensure_ascii=True)
    path.write_text(f"{serialized}\n", encoding="utf-8")
31
+
32
+
33
def write_jsonl(path: Path, rows: Iterable[dict]) -> None:
    """Write ``rows`` to ``path`` as JSON Lines: one ASCII-escaped object per line."""
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(f"{json.dumps(row, ensure_ascii=True)}\n" for row in rows)
@@ -0,0 +1,132 @@
1
+ Metadata-Version: 2.4
2
+ Name: contextiq
3
+ Version: 0.1.0
4
+ Summary: Turn messy files into agent-ready context.
5
+ Author: ContextIQ Contributors
6
+ License-Expression: MIT
7
+ Keywords: rag,agents,llm,ingestion,chunking,search,context
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Software Development :: Libraries
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Text Processing
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Provides-Extra: docs
23
+ Requires-Dist: python-docx>=1.1.0; extra == "docs"
24
+ Requires-Dist: pypdf>=5.0.0; extra == "docs"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
27
+ Dynamic: license-file
28
+
29
+ # ContextIQ
30
+
31
+ ContextIQ turns messy files into agent-ready context.
32
+
33
+ It is a local-first ingestion pipeline for developers building RAG systems, agent memory layers, document search, and eval datasets. Point it at a folder and it produces clean JSONL and Markdown exports with chunked, traceable content.
34
+
35
+ ## Why it exists
36
+
37
+ Most AI tooling starts after your data is already clean. Real projects get stuck much earlier:
38
+
39
+ - PDFs are noisy
40
+ - Word docs lose structure
41
+ - repos and notes mix formats
42
+ - chunks are inconsistent
43
+ - source traceability is easy to lose
44
+
45
+ ContextIQ focuses on the missing middle: consistent ingestion, chunking, and export.
46
+
47
+ ## Features
48
+
49
+ - Local-first CLI
50
+ - Recursive file ingestion
51
+ - Built-in support for:
52
+ - `.txt`, `.md`, `.rst`
53
+ - `.json`, `.jsonl`
54
+ - `.csv`, `.tsv`
55
+ - `.html`, `.htm`
56
+ - optional `.pdf` via `pypdf`
57
+ - optional `.docx` via `python-docx`
58
+ - Document-aware chunking
59
+ - Source-preserving metadata
60
+ - JSONL and Markdown exports
61
+ - Run manifest with counts, warnings, and timings
62
+
63
+ ## Quickstart
64
+
65
+ ```bash
66
+ python -m venv .venv
67
+ . .venv/bin/activate
68
+ pip install -e .[dev]
69
+ contextiq ingest ./examples --out ./build/context
70
+ ```
71
+
72
+ On Windows PowerShell:
73
+
74
+ ```powershell
75
+ python -m venv .venv
76
+ .venv\Scripts\Activate.ps1
77
+ pip install -e .[dev]
78
+ contextiq ingest .\examples --out .\build\context
79
+ ```
80
+
81
+ ## CLI
82
+
83
+ ```bash
84
+ contextiq ingest <path> --out <directory>
85
+ ```
86
+
87
+ Useful flags:
88
+
89
+ - `--include-ext .md,.txt,.json`
90
+ - `--exclude-glob "*.min.js,*.lock"`
91
+ - `--chunk-size 1200`
92
+ - `--chunk-overlap 150`
93
+ - `--formats jsonl,markdown`
94
+ - `--fail-on-warning`
95
+
96
+ ## Output
97
+
98
+ `contextiq ingest` writes:
99
+
100
+ - `documents.jsonl`: normalized source documents
101
+ - `chunks.jsonl`: chunked outputs for RAG/agents
102
+ - `chunks.md`: human-readable review file
103
+ - `manifest.json`: summary of the run
104
+
105
+ Each chunk preserves:
106
+
107
+ - source path
108
+ - document id
109
+ - chunk id
110
+ - byte and character ranges when available
111
+ - headings / section hints
112
+
113
+ ## Example
114
+
115
+ ```bash
116
+ contextiq ingest ./docs --out ./dist/context --chunk-size 900 --chunk-overlap 120
117
+ ```
118
+
119
+ ## Development
120
+
121
+ ```bash
122
+ pip install -e .[dev]
123
+ pytest
124
+ ```
125
+
126
+ ## Roadmap
127
+
128
+ - embeddings plugin interface
129
+ - vector DB exporters
130
+ - OCR pipeline
131
+ - table extraction
132
+ - citation-aware retrieval benchmarks
@@ -0,0 +1,21 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/contextiq/__init__.py
5
+ src/contextiq/__main__.py
6
+ src/contextiq/chunking.py
7
+ src/contextiq/cli.py
8
+ src/contextiq/discovery.py
9
+ src/contextiq/exporters.py
10
+ src/contextiq/loaders.py
11
+ src/contextiq/models.py
12
+ src/contextiq/pipeline.py
13
+ src/contextiq/utils.py
14
+ src/contextiq.egg-info/PKG-INFO
15
+ src/contextiq.egg-info/SOURCES.txt
16
+ src/contextiq.egg-info/dependency_links.txt
17
+ src/contextiq.egg-info/entry_points.txt
18
+ src/contextiq.egg-info/requires.txt
19
+ src/contextiq.egg-info/top_level.txt
20
+ tests/test_chunking.py
21
+ tests/test_pipeline.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ contextiq = contextiq.cli:main
@@ -0,0 +1,7 @@
1
+
2
+ [dev]
3
+ pytest>=8.0.0
4
+
5
+ [docs]
6
+ python-docx>=1.1.0
7
+ pypdf>=5.0.0
@@ -0,0 +1 @@
1
+ contextiq
@@ -0,0 +1,19 @@
1
+ from contextiq.chunking import chunk_document
2
+ from contextiq.models import Document
3
+
4
+
5
def test_chunk_document_preserves_source_metadata():
    """Every chunk of a two-heading document keeps the parent's doc_id and source_path."""
    # Two headed parts of 700 characters each: longer than one 800-character window.
    text = "# Intro\n\n" + ("A" * 700) + "\n\n## Details\n\n" + ("B" * 700)
    document = Document(
        doc_id="doc1",
        source_path="notes/sample.md",
        source_type="md",
        text=text,
    )

    chunks = chunk_document(document, chunk_size=800, chunk_overlap=100)

    assert len(chunks) >= 2
    assert all(chunk.doc_id == "doc1" for chunk in chunks)
    assert all(chunk.source_path == "notes/sample.md" for chunk in chunks)
    # The first window starts at the beginning of the text.
    assert chunks[0].start_char == 0
@@ -0,0 +1,20 @@
1
+ import json
2
+
3
+ from contextiq.pipeline import default_config, run_ingest
4
+
5
+
6
def test_pipeline_ingests_directory(tmp_path):
    """Ingesting a folder with a Markdown, a text and a JSON file yields three documents and no warnings."""
    docs = tmp_path / "docs"
    docs.mkdir()
    (docs / "a.md").write_text("# Title\n\nHello world\n\nSecond paragraph", encoding="utf-8")
    (docs / "b.txt").write_text("Plain text document", encoding="utf-8")
    (docs / "data.json").write_text(json.dumps({"name": "contextiq", "kind": "demo"}), encoding="utf-8")

    out_dir = tmp_path / "out"
    config = default_config(docs, out_dir)

    result = run_ingest(config)

    assert len(result.documents) == 3
    # Each of the three short files is expected to produce at least one chunk.
    assert len(result.chunks) >= 3
    assert result.warnings == []