contextiq 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextiq/__init__.py +5 -0
- contextiq/__main__.py +5 -0
- contextiq/chunking.py +60 -0
- contextiq/cli.py +63 -0
- contextiq/discovery.py +51 -0
- contextiq/exporters.py +47 -0
- contextiq/loaders.py +138 -0
- contextiq/models.py +59 -0
- contextiq/pipeline.py +70 -0
- contextiq/utils.py +36 -0
- contextiq-0.1.0.dist-info/METADATA +132 -0
- contextiq-0.1.0.dist-info/RECORD +16 -0
- contextiq-0.1.0.dist-info/WHEEL +5 -0
- contextiq-0.1.0.dist-info/entry_points.txt +2 -0
- contextiq-0.1.0.dist-info/licenses/LICENSE +21 -0
- contextiq-0.1.0.dist-info/top_level.txt +1 -0
contextiq/__init__.py
ADDED
contextiq/__main__.py
ADDED
contextiq/chunking.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .models import Chunk, Document
|
|
4
|
+
from .utils import stable_id
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def chunk_document(document: Document, chunk_size: int, chunk_overlap: int) -> list[Chunk]:
    """Split a document's text into overlapping, boundary-aware chunks.

    Each window spans at most ``chunk_size`` characters. When a window does
    not reach the end of the text, its end is pulled back to the last
    paragraph break (``"\\n\\n"``) or, failing that, the last sentence break
    (``". "``), provided the break lies far enough into the window.

    Args:
        document: Source document; only ``text``, ``doc_id``, ``source_path``,
            ``source_type`` and ``sections`` are read.
        chunk_size: Maximum window length in characters; must be positive.
        chunk_overlap: Number of characters the next window re-reads from the
            end of the previous one. Negative values are treated as ``0``.

    Returns:
        Chunks in document order. ``start_char``/``end_char`` describe the
        window before surrounding whitespace is stripped from ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if not document.text:
        return []

    text = document.text
    text_len = len(text)
    # A negative overlap would start the next window past the end of the
    # current one and silently drop the characters in between.
    overlap = max(chunk_overlap, 0)
    chunks: list[Chunk] = []
    start = 0
    index = 0

    while start < text_len:
        end = min(text_len, start + chunk_size)
        if end < text_len:
            # Prefer a paragraph break, but only if it leaves a reasonably
            # sized chunk; otherwise fall back to a sentence break.
            candidate = text.rfind("\n\n", start, end)
            if candidate > start + max(200, chunk_size // 3):
                end = candidate
            else:
                sentence_break = text.rfind(". ", start, end)
                if sentence_break > start + max(120, chunk_size // 4):
                    # Keep the full stop with the sentence it terminates.
                    end = sentence_break + 1

        chunk_text = text[start:end].strip()
        if chunk_text:
            chunks.append(
                Chunk(
                    chunk_id=stable_id(document.doc_id, str(index), str(start), str(end)),
                    doc_id=document.doc_id,
                    source_path=document.source_path,
                    text=chunk_text,
                    start_char=start,
                    end_char=end,
                    section_title=_resolve_section_title(document, start, end),
                    metadata={"source_type": document.source_type},
                )
            )

        if end >= text_len:
            break

        # Step back by the overlap. If the overlap would swallow the whole
        # window, continue from the window end instead of crawling forward one
        # character at a time (which would emit a chunk per character).
        next_start = end - overlap
        if next_start <= start:
            next_start = end
        start = next_start
        index += 1

    return chunks
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _resolve_section_title(document: Document, start: int, end: int) -> str | None:
    """Return the title of the first section related to the span ``[start, end)``.

    Sections are examined in ``document.sections`` order. A section matches
    when it contains ``start`` or when it begins inside the span. ``None`` is
    returned when no section matches.
    """
    return next(
        (
            sec.title
            for sec in document.sections
            if sec.start_char <= start < sec.end_char or start <= sec.start_char < end
        ),
        None,
    )
|
contextiq/cli.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .exporters import export_result
|
|
8
|
+
from .pipeline import default_config, run_ingest
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``contextiq`` argument parser with its single ``ingest`` sub-command."""
    root_parser = argparse.ArgumentParser(
        prog="contextiq",
        description="Turn messy files into agent-ready context.",
    )
    commands = root_parser.add_subparsers(dest="command", required=True)
    ingest_cmd = commands.add_parser("ingest", help="Ingest a directory of files.")

    ingest_cmd.add_argument("path", help="Root directory to ingest")
    ingest_cmd.add_argument("--out", required=True, help="Output directory")

    # Optional comma-separated filters; left as None when not supplied.
    for flag, description in (
        ("--include-ext", "Comma-separated file extensions to include"),
        ("--exclude-glob", "Comma-separated glob patterns to exclude"),
    ):
        ingest_cmd.add_argument(flag, help=description)

    # Integer chunking knobs.
    for flag, fallback, description in (
        ("--chunk-size", 1200, "Target chunk size in characters"),
        ("--chunk-overlap", 150, "Chunk overlap in characters"),
    ):
        ingest_cmd.add_argument(flag, type=int, default=fallback, help=description)

    ingest_cmd.add_argument(
        "--formats",
        default="jsonl,markdown",
        help="Comma-separated output formats",
    )
    ingest_cmd.add_argument(
        "--fail-on-warning",
        action="store_true",
        help="Exit non-zero if warnings occur",
    )
    return root_parser
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``contextiq`` command line interface.

    Args:
        argv: Arguments without the program name. ``None`` lets argparse read
            them from ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when the command is not ``ingest`` or the
        ingest raises ``ValueError`` (which includes ``--fail-on-warning``
        being triggered), and ``2`` when the input path is not a directory.
    """
    args = build_parser().parse_args(argv)
    if args.command != "ingest":
        return 1

    root = Path(args.path).resolve()
    out_dir = Path(args.out).resolve()
    # Without this check a mistyped path walks nothing and reports
    # "Ingested 0 documents" as if the run had succeeded.
    if not root.is_dir():
        print(f"error: input path is not a directory: {root}", file=sys.stderr)
        return 2

    config = default_config(root, out_dir)

    if args.include_ext:
        config.include_extensions = {normalize_extension(ext) for ext in args.include_ext.split(",") if ext.strip()}
    if args.exclude_glob:
        config.exclude_globs = [item.strip() for item in args.exclude_glob.split(",") if item.strip()]
    config.chunk_size = args.chunk_size
    config.chunk_overlap = args.chunk_overlap
    config.formats = {item.strip().lower() for item in args.formats.split(",") if item.strip()}
    config.fail_on_warning = args.fail_on_warning

    try:
        result = run_ingest(config)
    except ValueError as exc:
        # Report the failure as a message plus exit status, not a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1
    export_result(out_dir, result, root, config.formats, config.to_manifest_config())

    print(f"Ingested {len(result.documents)} documents into {len(result.chunks)} chunks.")
    if result.warnings:
        print(f"Warnings: {len(result.warnings)}")
    print(f"Output: {out_dir}")
    return 0
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize_extension(extension: str) -> str:
    """Lower-case and trim *extension*, adding a leading dot when it is missing.

    A blank input yields an empty string.
    """
    cleaned = extension.strip().lower()
    if cleaned and not cleaned.startswith("."):
        return f".{cleaned}"
    return cleaned
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Allow running the module directly; the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
|
contextiq/discovery.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Iterable
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
DEFAULT_EXTENSIONS = {
|
|
9
|
+
".txt",
|
|
10
|
+
".md",
|
|
11
|
+
".rst",
|
|
12
|
+
".json",
|
|
13
|
+
".jsonl",
|
|
14
|
+
".csv",
|
|
15
|
+
".tsv",
|
|
16
|
+
".html",
|
|
17
|
+
".htm",
|
|
18
|
+
".pdf",
|
|
19
|
+
".docx",
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
DEFAULT_EXCLUDES = {
|
|
23
|
+
".git",
|
|
24
|
+
".hg",
|
|
25
|
+
".svn",
|
|
26
|
+
".venv",
|
|
27
|
+
"venv",
|
|
28
|
+
"node_modules",
|
|
29
|
+
"__pycache__",
|
|
30
|
+
"dist",
|
|
31
|
+
"build",
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(slots=True)
|
|
36
|
+
class DiscoveryConfig:
|
|
37
|
+
include_extensions: set[str]
|
|
38
|
+
exclude_globs: list[str]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def iter_files(root: Path, config: DiscoveryConfig) -> Iterable[Path]:
|
|
42
|
+
for path in root.rglob("*"):
|
|
43
|
+
if not path.is_file():
|
|
44
|
+
continue
|
|
45
|
+
if any(part in DEFAULT_EXCLUDES for part in path.parts):
|
|
46
|
+
continue
|
|
47
|
+
if config.include_extensions and path.suffix.lower() not in config.include_extensions:
|
|
48
|
+
continue
|
|
49
|
+
if any(path.match(pattern) for pattern in config.exclude_globs):
|
|
50
|
+
continue
|
|
51
|
+
yield path
|
contextiq/exporters.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import datetime as dt
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .models import Chunk, IngestResult
|
|
7
|
+
from .utils import ensure_dir, write_json, write_jsonl
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def export_result(
    out_dir: Path,
    result: IngestResult,
    source_root: Path,
    formats: set[str],
    config: dict,
) -> None:
    """Write the ingest outputs and a run manifest into *out_dir*.

    ``documents.jsonl``, ``chunks.jsonl`` and ``manifest.json`` are always
    written; ``chunks.md`` is written only when ``"markdown"`` is in *formats*.

    Args:
        out_dir: Destination directory; created if missing.
        result: Documents, chunks and warnings produced by the ingest.
        source_root: Ingest root, recorded in the manifest as a string.
        formats: Requested output formats.
        config: JSON-serialisable configuration recorded in the manifest.
    """
    ensure_dir(out_dir)
    write_jsonl(out_dir / "documents.jsonl", (doc.to_dict() for doc in result.documents))
    write_jsonl(out_dir / "chunks.jsonl", (chunk.to_dict() for chunk in result.chunks))

    if "markdown" in formats:
        _write_markdown(out_dir / "chunks.md", result.chunks)

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop tzinfo so isoformat() keeps the same naive layout that
    # the explicit "Z" suffix is appended to.
    generated_at = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    manifest = {
        "generated_at": generated_at,
        "source_root": str(source_root),
        "summary": result.summary(),
        "warnings": result.warnings,
        "config": config,
    }
    write_json(out_dir / "manifest.json", manifest)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _write_markdown(path: Path, chunks: list[Chunk]) -> None:
|
|
35
|
+
lines = ["# ContextIQ Chunk Export", ""]
|
|
36
|
+
for chunk in chunks:
|
|
37
|
+
lines.append(f"## {chunk.chunk_id}")
|
|
38
|
+
lines.append("")
|
|
39
|
+
lines.append(f"- Source: `{chunk.source_path}`")
|
|
40
|
+
lines.append(f"- Document: `{chunk.doc_id}`")
|
|
41
|
+
lines.append(f"- Range: `{chunk.start_char}:{chunk.end_char}`")
|
|
42
|
+
if chunk.section_title:
|
|
43
|
+
lines.append(f"- Section: `{chunk.section_title}`")
|
|
44
|
+
lines.append("")
|
|
45
|
+
lines.append(chunk.text)
|
|
46
|
+
lines.append("")
|
|
47
|
+
path.write_text("\n".join(lines), encoding="utf-8")
|
contextiq/loaders.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import json
|
|
5
|
+
from html.parser import HTMLParser
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .models import Document, Section
|
|
9
|
+
from .utils import normalize_text, stable_id
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PlainTextExtractor(HTMLParser):
    """HTML parser that collects the visible text of a document.

    Whitespace-only text nodes are dropped and the rest are stripped. The
    contents of ``<script>`` and ``<style>`` elements are ignored because
    they hold code and styling, not document text.
    """

    # Elements whose character data must not appear in the extracted text.
    _NON_CONTENT_TAGS = frozenset({"script", "style"})

    def __init__(self) -> None:
        super().__init__()
        self.parts: list[str] = []
        # Number of currently open non-content elements.
        self._skip_depth = 0

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Start ignoring text when a non-content element opens."""
        if tag in self._NON_CONTENT_TAGS:
            self._skip_depth += 1

    def handle_endtag(self, tag: str) -> None:
        """Resume collecting text once the non-content element closes."""
        if tag in self._NON_CONTENT_TAGS and self._skip_depth:
            self._skip_depth -= 1

    def handle_data(self, data: str) -> None:
        """Record a stripped text node unless it is blank or inside a skipped element."""
        if self._skip_depth:
            return
        text = data.strip()
        if text:
            self.parts.append(text)

    def get_text(self) -> str:
        """Return the collected text nodes joined by newlines."""
        return "\n".join(self.parts)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def extract_sections(text: str) -> list[Section]:
    """Locate Markdown-style ``#`` headings in *text* and return them as sections.

    A line counts as a heading when, after stripping, it starts with one to
    six ``#`` characters followed by whitespace or the end of the line. This
    leaves out shebangs (``#!/bin/sh``), tags (``#topic``) and runs of seven
    or more hashes. Lines inside fenced code blocks are ignored, so shell or
    Python comments in examples do not become section titles.

    Args:
        text: Document text.

    Returns:
        One ``Section`` per heading, in order. Each section starts at the
        character offset of its heading line and ends where the next heading
        starts (or at ``len(text)`` for the last one). A heading without text
        is titled ``"Section"``.
    """
    headings: list[tuple[str, int, int]] = []
    offset = 0
    open_fence: str | None = None
    # keepends=True keeps the real line-ending length in each piece, so the
    # offsets stay exact whatever newline convention the text uses.
    for raw_line in text.splitlines(keepends=True):
        stripped = raw_line.strip()
        if open_fence is not None:
            if stripped.startswith(open_fence):
                open_fence = None
        elif stripped.startswith(("```", "~~~")):
            open_fence = stripped[:3]
        elif stripped.startswith("#"):
            level = len(stripped) - len(stripped.lstrip("#"))
            remainder = stripped[level:]
            if level <= 6 and (not remainder or remainder[0].isspace()):
                headings.append((remainder.strip() or "Section", level, offset))
        offset += len(raw_line)

    sections: list[Section] = []
    for index, (title, level, start) in enumerate(headings):
        end = headings[index + 1][2] if index + 1 < len(headings) else len(text)
        sections.append(Section(title=title, level=level, start_char=start, end_char=end))
    return sections
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def load_document(path: Path, root: Path) -> Document:
|
|
46
|
+
suffix = path.suffix.lower()
|
|
47
|
+
rel_path = path.relative_to(root).as_posix()
|
|
48
|
+
warnings: list[str] = []
|
|
49
|
+
|
|
50
|
+
if suffix in {".txt", ".md", ".rst"}:
|
|
51
|
+
text = path.read_text(encoding="utf-8", errors="ignore")
|
|
52
|
+
source_type = suffix[1:]
|
|
53
|
+
elif suffix in {".json", ".jsonl"}:
|
|
54
|
+
text = _load_json_like(path, suffix)
|
|
55
|
+
source_type = suffix[1:]
|
|
56
|
+
elif suffix in {".csv", ".tsv"}:
|
|
57
|
+
text = _load_delimited(path, "\t" if suffix == ".tsv" else ",")
|
|
58
|
+
source_type = suffix[1:]
|
|
59
|
+
elif suffix in {".html", ".htm"}:
|
|
60
|
+
text = _load_html(path)
|
|
61
|
+
source_type = "html"
|
|
62
|
+
elif suffix == ".pdf":
|
|
63
|
+
text, warning = _load_pdf(path)
|
|
64
|
+
source_type = "pdf"
|
|
65
|
+
if warning:
|
|
66
|
+
warnings.append(warning)
|
|
67
|
+
elif suffix == ".docx":
|
|
68
|
+
text, warning = _load_docx(path)
|
|
69
|
+
source_type = "docx"
|
|
70
|
+
if warning:
|
|
71
|
+
warnings.append(warning)
|
|
72
|
+
else:
|
|
73
|
+
raise ValueError(f"Unsupported file type: {path}")
|
|
74
|
+
|
|
75
|
+
normalized = normalize_text(text)
|
|
76
|
+
return Document(
|
|
77
|
+
doc_id=stable_id(rel_path),
|
|
78
|
+
source_path=rel_path,
|
|
79
|
+
source_type=source_type,
|
|
80
|
+
text=normalized,
|
|
81
|
+
metadata={"extension": suffix},
|
|
82
|
+
sections=extract_sections(normalized),
|
|
83
|
+
warnings=warnings,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _load_json_like(path: Path, suffix: str) -> str:
    """Return the JSON content of *path* re-serialised with two-space indentation.

    For ``".json"`` the whole file is parsed as one value. For any other
    suffix the file is read as JSON Lines: blank lines are skipped and the
    pretty-printed records are separated by an empty line.
    """
    if suffix != ".json":
        with path.open("r", encoding="utf-8", errors="ignore") as stream:
            records = [json.loads(entry) for entry in map(str.strip, stream) if entry]
        return "\n\n".join(json.dumps(record, indent=2, ensure_ascii=False) for record in records)

    parsed = json.loads(path.read_text(encoding="utf-8", errors="ignore"))
    return json.dumps(parsed, indent=2, ensure_ascii=False)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _load_delimited(path: Path, delimiter: str) -> str:
    """Render a delimited file as text, one ``a | b | c`` line per record.

    Cells are stripped of surrounding whitespace before being joined.
    """
    rendered: list[str] = []
    with path.open("r", encoding="utf-8", errors="ignore", newline="") as stream:
        for record in csv.reader(stream, delimiter=delimiter):
            rendered.append(" | ".join(map(str.strip, record)))
    return "\n".join(rendered)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _load_html(path: Path) -> str:
    """Return the text content of the HTML file at *path*.

    The file is decoded as UTF-8, ignoring undecodable bytes, and fed through
    ``PlainTextExtractor``.
    """
    parser = PlainTextExtractor()
    parser.feed(path.read_text(encoding="utf-8", errors="ignore"))
    # feed() may hold back trailing text (for example after an unterminated
    # "&") while waiting for more input; close() forces it to be processed.
    parser.close()
    return parser.get_text()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _load_pdf(path: Path) -> tuple[str, str | None]:
    """Extract the text of a PDF using the optional ``pypdf`` package.

    Returns:
        ``(text, None)`` with the page texts separated by blank lines, or
        ``("", warning)`` when ``pypdf`` cannot be imported.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        missing_dependency = (
            "Skipped PDF content because 'pypdf' is not installed. Install with: pip install contextiq[docs]"
        )
        return "", missing_dependency

    page_texts: list[str] = []
    for page in PdfReader(str(path)).pages:
        # extract_text() may yield nothing for a page; treat that as empty text.
        page_texts.append(page.extract_text() or "")
    return "\n\n".join(page_texts), None
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _load_docx(path: Path) -> tuple[str, str | None]:
    """Extract the paragraph text of a DOCX file using the optional ``python-docx`` package.

    Returns:
        ``(text, None)`` with one line per paragraph, or ``("", warning)``
        when the ``docx`` module cannot be imported.
    """
    try:
        from docx import Document as WordDocument
    except ImportError:
        missing_dependency = (
            "Skipped DOCX content because 'python-docx' is not installed. Install with: pip install contextiq[docs]"
        )
        return "", missing_dependency

    paragraphs = WordDocument(str(path)).paragraphs
    return "\n".join(item.text for item in paragraphs), None
|
contextiq/models.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import asdict, dataclass, field
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(slots=True)
|
|
8
|
+
class Section:
|
|
9
|
+
title: str
|
|
10
|
+
level: int
|
|
11
|
+
start_char: int
|
|
12
|
+
end_char: int
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(slots=True)
|
|
16
|
+
class Document:
|
|
17
|
+
doc_id: str
|
|
18
|
+
source_path: str
|
|
19
|
+
source_type: str
|
|
20
|
+
text: str
|
|
21
|
+
metadata: dict[str, Any] = field(default_factory=dict)
|
|
22
|
+
sections: list[Section] = field(default_factory=list)
|
|
23
|
+
warnings: list[str] = field(default_factory=list)
|
|
24
|
+
|
|
25
|
+
def to_dict(self) -> dict[str, Any]:
|
|
26
|
+
payload = asdict(self)
|
|
27
|
+
payload["char_count"] = len(self.text)
|
|
28
|
+
return payload
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(slots=True)
|
|
32
|
+
class Chunk:
|
|
33
|
+
chunk_id: str
|
|
34
|
+
doc_id: str
|
|
35
|
+
source_path: str
|
|
36
|
+
text: str
|
|
37
|
+
start_char: int
|
|
38
|
+
end_char: int
|
|
39
|
+
section_title: str | None
|
|
40
|
+
metadata: dict[str, Any] = field(default_factory=dict)
|
|
41
|
+
|
|
42
|
+
def to_dict(self) -> dict[str, Any]:
|
|
43
|
+
payload = asdict(self)
|
|
44
|
+
payload["char_count"] = len(self.text)
|
|
45
|
+
return payload
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass(slots=True)
|
|
49
|
+
class IngestResult:
|
|
50
|
+
documents: list[Document]
|
|
51
|
+
chunks: list[Chunk]
|
|
52
|
+
warnings: list[str]
|
|
53
|
+
|
|
54
|
+
def summary(self) -> dict[str, Any]:
|
|
55
|
+
return {
|
|
56
|
+
"document_count": len(self.documents),
|
|
57
|
+
"chunk_count": len(self.chunks),
|
|
58
|
+
"warning_count": len(self.warnings),
|
|
59
|
+
}
|
contextiq/pipeline.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .chunking import chunk_document
|
|
7
|
+
from .discovery import DEFAULT_EXTENSIONS, DiscoveryConfig, iter_files
|
|
8
|
+
from .loaders import load_document
|
|
9
|
+
from .models import IngestResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(slots=True)
|
|
13
|
+
class IngestConfig:
|
|
14
|
+
root: Path
|
|
15
|
+
output_dir: Path
|
|
16
|
+
include_extensions: set[str]
|
|
17
|
+
exclude_globs: list[str]
|
|
18
|
+
chunk_size: int
|
|
19
|
+
chunk_overlap: int
|
|
20
|
+
formats: set[str]
|
|
21
|
+
fail_on_warning: bool = False
|
|
22
|
+
|
|
23
|
+
def to_manifest_config(self) -> dict:
|
|
24
|
+
return {
|
|
25
|
+
"include_extensions": sorted(self.include_extensions),
|
|
26
|
+
"exclude_globs": self.exclude_globs,
|
|
27
|
+
"chunk_size": self.chunk_size,
|
|
28
|
+
"chunk_overlap": self.chunk_overlap,
|
|
29
|
+
"formats": sorted(self.formats),
|
|
30
|
+
"fail_on_warning": self.fail_on_warning,
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def default_config(root: Path, output_dir: Path) -> IngestConfig:
    """Return an ``IngestConfig`` for *root* and *output_dir* with the stock settings.

    The extension set is a fresh copy of ``DEFAULT_EXTENSIONS``, so callers
    may mutate it.
    """
    stock_settings = {
        "include_extensions": set(DEFAULT_EXTENSIONS),
        "exclude_globs": [],
        "chunk_size": 1200,
        "chunk_overlap": 150,
        "formats": {"jsonl", "markdown"},
        "fail_on_warning": False,
    }
    return IngestConfig(root=root, output_dir=output_dir, **stock_settings)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def run_ingest(config: IngestConfig) -> IngestResult:
    """Discover, load and chunk every matching file under ``config.root``.

    Every loaded document is kept, including those with no text; only
    documents with text are chunked. Document warnings are collected with the
    document's source path as a prefix.

    Raises:
        ValueError: If ``config.fail_on_warning`` is set and any warning was collected.
    """
    discovery = DiscoveryConfig(
        include_extensions=set(map(str.lower, config.include_extensions)),
        exclude_globs=config.exclude_globs,
    )
    documents = []
    chunks = []
    warnings: list[str] = []

    for path in iter_files(config.root, discovery):
        document = load_document(path, config.root)
        documents.append(document)
        warnings += [f"{document.source_path}: {warning}" for warning in document.warnings]
        if document.text:
            chunks.extend(chunk_document(document, config.chunk_size, config.chunk_overlap))

    if config.fail_on_warning and warnings:
        raise ValueError("Warnings encountered during ingest:\n" + "\n".join(warnings))

    return IngestResult(documents=documents, chunks=chunks, warnings=warnings)
|
contextiq/utils.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Iterable
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# One or more consecutive spaces/tabs.
WHITESPACE_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines, i.e. more than one blank line in a row.
BLANKLINE_RE = re.compile(r"\n{3,}")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def stable_id(*parts: str) -> str:
    """Return a deterministic 16-hex-character identifier for *parts*.

    The parts are joined with ``"::"`` and hashed with SHA-1; the first 16
    hex digits of the digest are returned.
    """
    joined = "::".join(parts)
    # The digest is a fingerprint, not a security measure. Saying so keeps
    # this working on builds that restrict SHA-1 for security purposes.
    digest = hashlib.sha1(joined.encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()[:16]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def normalize_text(text: str) -> str:
    """Normalise line endings and whitespace in *text*.

    ``\\r\\n`` and ``\\r`` become ``\\n``, runs of spaces/tabs within a line
    collapse to one space, trailing whitespace is removed from each line,
    runs of three or more newlines shrink to two, and the result is stripped.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    cleaned_lines: list[str] = []
    for raw_line in unified.splitlines():
        cleaned_lines.append(WHITESPACE_RE.sub(" ", raw_line).rstrip())
    collapsed = BLANKLINE_RE.sub("\n\n", "\n".join(cleaned_lines))
    return collapsed.strip()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def ensure_dir(path: Path) -> None:
    """Create the directory *path*, including missing parents; an existing directory is left alone."""
    path.mkdir(exist_ok=True, parents=True)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def write_json(path: Path, data: object) -> None:
    """Write *data* to *path* as two-space-indented, ASCII-escaped JSON followed by a newline."""
    serialized = json.dumps(data, indent=2, ensure_ascii=True)
    path.write_text(f"{serialized}\n", encoding="utf-8")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def write_jsonl(path: Path, rows: Iterable[dict]) -> None:
    """Write *rows* to *path* as JSON Lines: one ASCII-escaped JSON object per line."""
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(json.dumps(record, ensure_ascii=True) + "\n" for record in rows)
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: contextiq
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Turn messy files into agent-ready context.
|
|
5
|
+
Author: ContextIQ Contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: rag,agents,llm,ingestion,chunking,search,context
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Text Processing
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Provides-Extra: docs
|
|
23
|
+
Requires-Dist: python-docx>=1.1.0; extra == "docs"
|
|
24
|
+
Requires-Dist: pypdf>=5.0.0; extra == "docs"
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# ContextIQ
|
|
30
|
+
|
|
31
|
+
ContextIQ turns messy files into agent-ready context.
|
|
32
|
+
|
|
33
|
+
It is a local-first ingestion pipeline for developers building RAG systems, agent memory layers, document search, and eval datasets. Point it at a folder and it produces clean JSONL and Markdown exports with chunked, traceable content.
|
|
34
|
+
|
|
35
|
+
## Why it exists
|
|
36
|
+
|
|
37
|
+
Most AI tooling starts after your data is already clean. Real projects get stuck much earlier:
|
|
38
|
+
|
|
39
|
+
- PDFs are noisy
|
|
40
|
+
- Word docs lose structure
|
|
41
|
+
- repos and notes mix formats
|
|
42
|
+
- chunks are inconsistent
|
|
43
|
+
- source traceability is easy to lose
|
|
44
|
+
|
|
45
|
+
ContextIQ focuses on the missing middle: consistent ingestion, chunking, and export.
|
|
46
|
+
|
|
47
|
+
## Features
|
|
48
|
+
|
|
49
|
+
- Local-first CLI
|
|
50
|
+
- Recursive file ingestion
|
|
51
|
+
- Built-in support for:
|
|
52
|
+
- `.txt`, `.md`, `.rst`
|
|
53
|
+
- `.json`, `.jsonl`
|
|
54
|
+
- `.csv`, `.tsv`
|
|
55
|
+
- `.html`, `.htm`
|
|
56
|
+
- optional `.pdf` via `pypdf`
|
|
57
|
+
- optional `.docx` via `python-docx`
|
|
58
|
+
- Document-aware chunking
|
|
59
|
+
- Source-preserving metadata
|
|
60
|
+
- JSONL and Markdown exports
|
|
61
|
+
- Run manifest with counts, warnings, and a generation timestamp
|
|
62
|
+
|
|
63
|
+
## Quickstart
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
python -m venv .venv
|
|
67
|
+
. .venv/bin/activate
|
|
68
|
+
pip install -e .[dev]
|
|
69
|
+
contextiq ingest ./examples --out ./build/context
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
On Windows PowerShell:
|
|
73
|
+
|
|
74
|
+
```powershell
|
|
75
|
+
python -m venv .venv
|
|
76
|
+
.venv\Scripts\Activate.ps1
|
|
77
|
+
pip install -e .[dev]
|
|
78
|
+
contextiq ingest .\examples --out .\build\context
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## CLI
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
contextiq ingest <path> --out <directory>
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Useful flags:
|
|
88
|
+
|
|
89
|
+
- `--include-ext .md,.txt,.json`
|
|
90
|
+
- `--exclude-glob "*.min.js,*.lock"`
|
|
91
|
+
- `--chunk-size 1200`
|
|
92
|
+
- `--chunk-overlap 150`
|
|
93
|
+
- `--formats jsonl,markdown`
|
|
94
|
+
- `--fail-on-warning`
|
|
95
|
+
|
|
96
|
+
## Output
|
|
97
|
+
|
|
98
|
+
`contextiq ingest` writes:
|
|
99
|
+
|
|
100
|
+
- `documents.jsonl`: normalized source documents
|
|
101
|
+
- `chunks.jsonl`: chunked outputs for RAG/agents
|
|
102
|
+
- `chunks.md`: human-readable review file
|
|
103
|
+
- `manifest.json`: summary of the run
|
|
104
|
+
|
|
105
|
+
Each chunk preserves:
|
|
106
|
+
|
|
107
|
+
- source path
|
|
108
|
+
- document id
|
|
109
|
+
- chunk id
|
|
110
|
+
- character ranges (`start_char` / `end_char`)
|
|
111
|
+
- headings / section hints
|
|
112
|
+
|
|
113
|
+
## Example
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
contextiq ingest ./docs --out ./dist/context --chunk-size 900 --chunk-overlap 120
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Development
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
pip install -e .[dev]
|
|
123
|
+
pytest
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Roadmap
|
|
127
|
+
|
|
128
|
+
- embeddings plugin interface
|
|
129
|
+
- vector DB exporters
|
|
130
|
+
- OCR pipeline
|
|
131
|
+
- table extraction
|
|
132
|
+
- citation-aware retrieval benchmarks
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
contextiq/__init__.py,sha256=FH5sl2acGwieJuoxRx2_GQnlcYlWAGXAT_VWA12YreM,75
|
|
2
|
+
contextiq/__main__.py,sha256=PSQ4rpL0dG6f-qH4N7H-gD9igQkdHzH4yVZDcW8lfZo,80
|
|
3
|
+
contextiq/chunking.py,sha256=9OsV1SEZIHmjRbclMKzUdlSq9hm2ZyYULIYxT5XP_7I,1914
|
|
4
|
+
contextiq/cli.py,sha256=gYIGawXR6KkSpHIFzK75Ept8yzw1TbvqRcIXMcc2mYQ,2585
|
|
5
|
+
contextiq/discovery.py,sha256=RxKAOhOYrwFzzVt-hNgnf-2lH6hG63bUT2YoJlOXgFk,1017
|
|
6
|
+
contextiq/exporters.py,sha256=Uu8nyk3wYLdAImA4sixXKd0aOpli2hu8t7QomkFPSgc,1518
|
|
7
|
+
contextiq/loaders.py,sha256=qFhMpIWcHt9CTV8VYKmhm6msODoEXjBJUtrf7E8bu1k,4492
|
|
8
|
+
contextiq/models.py,sha256=9RaRHyWstuncK6jNv6ajQYBCQajKCgCsE5uvzNg_1Gc,1357
|
|
9
|
+
contextiq/pipeline.py,sha256=aii9pNvyGnrE9r2BbgbiaNxAsONGQ4bKI97I5df59Fc,2248
|
|
10
|
+
contextiq/utils.py,sha256=RWhTsK3DMREfhwPusi-E7SG1uHgvkNUAa5qPA05OQYQ,1006
|
|
11
|
+
contextiq-0.1.0.dist-info/licenses/LICENSE,sha256=vZ6K0Ke-h7-cLmni9LqWKOjZl0dNuNjtp9iSRg_JTB8,1079
|
|
12
|
+
contextiq-0.1.0.dist-info/METADATA,sha256=3MwNOSPcBdgMa0ckIF19J5eYxmJMBW4e0cMC7Ky16aQ,3282
|
|
13
|
+
contextiq-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
14
|
+
contextiq-0.1.0.dist-info/entry_points.txt,sha256=_zlsqfsWLhcXaq8562sIunqf0k34Gyj7_V_1BCSYFP4,49
|
|
15
|
+
contextiq-0.1.0.dist-info/top_level.txt,sha256=7oOR5pTvHfJG2XzeIykufAXd39x83_Vo-UzkAMwWNl0,10
|
|
16
|
+
contextiq-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ContextIQ Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
contextiq
|