embedmr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ include README.md
2
+ include pyproject.toml
3
+
4
+ recursive-include src/embedmr *.py
5
+
6
+ prune tests
7
+ prune .venv
8
+ prune .pytest_cache
9
+ prune build
10
+ prune dist
11
+ prune src/*.egg-info
12
+ global-exclude __pycache__ *.pyc *.pyo *.tmp *.log
embedmr-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,6 @@
1
+ Metadata-Version: 2.4
2
+ Name: embedmr
3
+ Version: 0.1.0
4
+ Summary: Deterministic text chunking, validation, and embedding-cache utilities
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
File without changes
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "embedmr"
7
+ version = "0.1.0"
8
+ description = "Deterministic text chunking, validation, and embedding-cache utilities"
9
+ readme = "README.md"
10
+ requires-python = ">=3.13"
11
+ dependencies = []
12
+
13
+ [dependency-groups]
14
+ dev = [
15
+ "build>=1.4.0",
16
+ "mypy>=1.19.1",
17
+ "pytest>=9.0.2",
18
+ "ruff>=0.15.1",
19
+ "twine>=6.2.0",
20
+ ]
21
+
22
+ [tool.setuptools]
23
+ package-dir = {"" = "src"}
24
+
25
+ [tool.setuptools.packages.find]
26
+ where = ["src"]
27
+
28
+ [project.scripts]
29
+ embedmr = "embedmr.cli:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
# src/embedmr/__init__.py
__all__ = ["__version__"]

# Keep in sync with [project].version in pyproject.toml (and PKG-INFO),
# which both declare 0.1.0; the previous "0.0.1" here was stale.
__version__ = "0.1.0"
File without changes
@@ -0,0 +1,181 @@
1
+ # src/embedmr/cache/cache_index_sqlite.py
2
+ from __future__ import annotations
3
+
4
+ import sqlite3
5
+ import time
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from embedmr.runtime.constants import CacheStatus
11
+
12
+
13
@dataclass(frozen=True, slots=True)
class CacheRecord:
    """One row of the cache_index table (see SQLiteCacheIndex)."""
    cache_key: str                 # primary key
    vec_ref: Optional[str]         # reference to the stored vector; NULL until READY
    dim: Optional[int]             # embedding dimension; NULL until READY
    status: CacheStatus            # READY | IN_PROGRESS | FAILED
    created_at: Optional[int]      # unix epoch seconds of first insert
    error: Optional[str]           # last failure message, if any
21
+
22
+
23
class SQLiteCacheIndex:
    """
    Stage 0 must-have schema + atomic state transitions.

    Table: cache_index
        cache_key   TEXT PRIMARY KEY
        vec_ref     TEXT
        dim         INTEGER
        status      TEXT (READY|IN_PROGRESS|FAILED)
        created_at  INTEGER (unix epoch seconds)
        error       TEXT

    Every public method opens, uses and CLOSES its own connection.
    (sqlite3's `with con:` only commits/rolls back — it does not close,
    so the previous implementation leaked one connection per call.)
    """

    def __init__(self, db_path: str | Path) -> None:
        self._path = Path(db_path)
        self._path.parent.mkdir(parents=True, exist_ok=True)

    def connect(self) -> sqlite3.Connection:
        """Open a new autocommit connection; transactions are issued explicitly."""
        con = sqlite3.connect(str(self._path), timeout=30.0, isolation_level=None)  # autocommit; use BEGIN manually
        con.execute("PRAGMA journal_mode=WAL;")
        con.execute("PRAGMA synchronous=NORMAL;")
        con.execute("PRAGMA foreign_keys=ON;")
        con.execute("PRAGMA busy_timeout=30000;")
        return con

    def init_db(self) -> None:
        """Create the cache_index table and its status index if missing (idempotent)."""
        con = self.connect()
        try:
            con.execute(
                """
                CREATE TABLE IF NOT EXISTS cache_index (
                    cache_key  TEXT PRIMARY KEY,
                    vec_ref    TEXT,
                    dim        INTEGER,
                    status     TEXT NOT NULL,
                    created_at INTEGER,
                    error      TEXT
                );
                """
            )
            con.execute("CREATE INDEX IF NOT EXISTS idx_cache_status ON cache_index(status);")
        finally:
            con.close()

    def get(self, cache_key: str) -> Optional[CacheRecord]:
        """Fetch one record by key, or None when the key is unknown."""
        con = self.connect()
        try:
            row = con.execute(
                "SELECT cache_key, vec_ref, dim, status, created_at, error FROM cache_index WHERE cache_key=?",
                (cache_key,),
            ).fetchone()
        finally:
            con.close()
        if row is None:
            return None
        return CacheRecord(
            cache_key=row[0],
            vec_ref=row[1],
            dim=row[2],
            status=CacheStatus(row[3]),
            created_at=row[4],
            error=row[5],
        )

    def try_mark_in_progress(self, cache_key: str) -> bool:
        """
        Single-flight primitive (local host v1):
          - If row doesn't exist: insert IN_PROGRESS -> True
          - If exists and status==FAILED: move to IN_PROGRESS (retry) -> True
          - If exists and status==IN_PROGRESS/READY: -> False
        Atomic via BEGIN IMMEDIATE + conditional updates.
        """
        now = int(time.time())
        con = self.connect()
        try:
            con.execute("BEGIN IMMEDIATE;")
            try:
                existing = con.execute(
                    "SELECT status FROM cache_index WHERE cache_key=?",
                    (cache_key,),
                ).fetchone()
                if existing is None:
                    con.execute(
                        """
                        INSERT INTO cache_index(cache_key, vec_ref, dim, status, created_at, error)
                        VALUES(?, NULL, NULL, ?, ?, NULL)
                        """,
                        (cache_key, CacheStatus.IN_PROGRESS.value, now),
                    )
                    con.execute("COMMIT;")
                    return True

                status = CacheStatus(existing[0])
                if status == CacheStatus.FAILED:
                    cur = con.execute(
                        """
                        UPDATE cache_index
                        SET status=?, vec_ref=NULL, dim=NULL, error=NULL
                        WHERE cache_key=? AND status=?
                        """,
                        (
                            CacheStatus.IN_PROGRESS.value,
                            cache_key,
                            CacheStatus.FAILED.value,
                        ),
                    )
                    # Use this statement's rowcount, not connection-wide
                    # total_changes, to decide whether we won the row.
                    changed = cur.rowcount > 0
                    con.execute("COMMIT;")
                    return changed

                con.execute("COMMIT;")
                return False
            except Exception:
                con.execute("ROLLBACK;")
                raise
        finally:
            con.close()

    def mark_ready(self, cache_key: str, *, vec_ref: str, dim: int) -> None:
        """Upsert a READY row; an update preserves the original created_at."""
        if dim <= 0:
            raise ValueError("dim must be positive")
        con = self.connect()
        try:
            con.execute("BEGIN IMMEDIATE;")
            try:
                con.execute(
                    """
                    INSERT INTO cache_index(cache_key, vec_ref, dim, status, created_at, error)
                    VALUES(?, ?, ?, ?, COALESCE((SELECT created_at FROM cache_index WHERE cache_key=?), ?), NULL)
                    ON CONFLICT(cache_key) DO UPDATE SET
                        vec_ref=excluded.vec_ref,
                        dim=excluded.dim,
                        status=excluded.status,
                        error=NULL
                    """,
                    (
                        cache_key,
                        vec_ref,
                        dim,
                        CacheStatus.READY.value,
                        cache_key,
                        int(time.time()),
                    ),
                )
                con.execute("COMMIT;")
            except Exception:
                con.execute("ROLLBACK;")
                raise
        finally:
            con.close()

    def mark_failed(self, cache_key: str, *, error: str) -> None:
        """Upsert a FAILED row; error text is trimmed and capped at 2000 chars."""
        err = (error or "").strip()[:2000]
        con = self.connect()
        try:
            con.execute("BEGIN IMMEDIATE;")
            try:
                con.execute(
                    """
                    INSERT INTO cache_index(cache_key, vec_ref, dim, status, created_at, error)
                    VALUES(?, NULL, NULL, ?, ?, ?)
                    ON CONFLICT(cache_key) DO UPDATE SET
                        status=excluded.status,
                        error=excluded.error
                    """,
                    (cache_key, CacheStatus.FAILED.value, int(time.time()), err),
                )
                con.execute("COMMIT;")
            except Exception:
                con.execute("ROLLBACK;")
                raise
        finally:
            con.close()
@@ -0,0 +1,5 @@
1
+ # src/embedmr/chunking/__init__.py
2
+ from .chunker import ChunkerConfig, make_chunks
3
+ from .validate import validate_chunks_jsonl
4
+
5
+ __all__ = ["ChunkerConfig", "make_chunks", "validate_chunks_jsonl"]
@@ -0,0 +1,170 @@
1
+ # src/embedmr/chunking/chunker.py
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple
8
+
9
+ from embedmr.core.hashing import sha256_hex_str
10
+ from embedmr.core.normalize import NormalizerConfig, normalize_text
11
+ from embedmr.core.schemas import ChunkRow
12
+ from embedmr.dataio.atomic import atomic_write_text, AtomicWriteConfig
13
+
14
+
15
@dataclass(frozen=True, slots=True)
class ChunkerConfig:
    """
    Deterministic v1: character window with overlap, on newline-normalized text.
    """
    chunk_size: int = 1000
    overlap: int = 200
    chunker_version: str = "chunk:v1"
    # doc_id stability:
    #   - if the user provides doc_id in JSON/JSONL, use it
    #   - else derive it from the normalized full document text (content-based)
    docid_normalizer: NormalizerConfig = NormalizerConfig(lowercase=False)

    def __post_init__(self) -> None:
        # Validate eagerly so a bad configuration fails at construction time.
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be > 0")
        if not 0 <= self.overlap < self.chunk_size:
            raise ValueError("overlap must be >=0 and < chunk_size")
        if "|" in self.chunker_version:
            raise ValueError("chunker_version must not contain '|' (used as delimiter elsewhere)")
35
+
36
+
37
+ def _normalize_newlines_only(text: str) -> str:
38
+ # Deterministic across OS tools: CRLF/CR -> LF
39
+ return text.replace("\r\n", "\n").replace("\r", "\n")
40
+
41
+
42
def _derive_doc_id(full_text: str, *, cfg: ChunkerConfig) -> str:
    """Derive a content-based, stable doc_id: sha256 of the normalized full text."""
    # Normalizing with cfg.docid_normalizer first makes the id insensitive
    # to whitespace/unicode/newline noise in the raw input.
    text_for_id = normalize_text(full_text, cfg=cfg.docid_normalizer)
    return sha256_hex_str(text_for_id)
46
+
47
+
48
+ def _chunk_ranges(n: int, *, size: int, overlap: int) -> Iterator[Tuple[int, int]]:
49
+ if n == 0:
50
+ return
51
+ step = size - overlap
52
+ start = 0
53
+ while start < n:
54
+ end = min(n, start + size)
55
+ yield start, end
56
+ if end == n:
57
+ break
58
+ start += step
59
+
60
+
61
+ def _read_text_file(path: Path) -> str:
62
+ return path.read_text(encoding="utf-8")
63
+
64
+
65
+ def _iter_input_documents(inputs: Sequence[str | Path]) -> Iterator[Tuple[str, str, Optional[Dict[str, Any]]]]:
66
+ """
67
+ Yields (doc_id_or_empty, text, metadata_or_none).
68
+
69
+ Accepted input files:
70
+ - .txt : doc_id derived, metadata includes {"source_path": "..."}
71
+ - .json : single object with fields {"text": "...", "doc_id"?, "metadata"?}
72
+ - .jsonl : many objects with fields {"text": "...", "doc_id"?, "metadata"?}
73
+ """
74
+ paths: List[Path] = []
75
+ for x in inputs:
76
+ p = Path(x)
77
+ if p.is_dir():
78
+ paths.extend([q for q in p.rglob("*") if q.is_file()])
79
+ elif p.is_file():
80
+ paths.append(p)
81
+ else:
82
+ raise FileNotFoundError(f"Input path not found: {p}")
83
+
84
+ for p in sorted(paths, key=lambda z: str(z).lower()):
85
+ suf = p.suffix.lower()
86
+ if suf == ".txt":
87
+ text = _read_text_file(p)
88
+ md = {"source_path": str(p)}
89
+ yield ("", text, md)
90
+ elif suf == ".json":
91
+ obj = json.loads(p.read_text(encoding="utf-8"))
92
+ if not isinstance(obj, dict) or "text" not in obj:
93
+ raise ValueError(f"{p} must be an object with at least a 'text' field")
94
+ yield (str(obj.get("doc_id") or ""), str(obj["text"]), obj.get("metadata"))
95
+ elif suf == ".jsonl":
96
+ with p.open("r", encoding="utf-8") as f:
97
+ for line_no, line in enumerate(f, start=1):
98
+ s = line.strip()
99
+ if not s:
100
+ continue
101
+ obj = json.loads(s)
102
+ if not isinstance(obj, dict) or "text" not in obj:
103
+ raise ValueError(f"{p}:{line_no} must be an object with at least a 'text' field")
104
+ yield (str(obj.get("doc_id") or ""), str(obj["text"]), obj.get("metadata"))
105
+ else:
106
+ # ignore unknown files
107
+ continue
108
+
109
+
110
def make_chunks(
    *,
    inputs: Sequence[str | Path],
    output_jsonl: str | Path,
    cfg: ChunkerConfig = ChunkerConfig(),
) -> Dict[str, int]:
    """
    Produce chunks.jsonl (atomic write) from the given inputs.

    Returns basic counts: {"docs": X, "chunks": Y}.

    Raises:
        ValueError: when a document's metadata is present but not a dict.
    """
    out_path = Path(output_jsonl)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    rows: List[Dict[str, Any]] = []
    docs = 0
    chunks = 0

    for provided_doc_id, raw_text, md in _iter_input_documents(inputs):
        docs += 1
        text = _normalize_newlines_only(raw_text)
        # User-supplied doc_id wins; otherwise derive a content-based one.
        doc_id = provided_doc_id.strip() or _derive_doc_id(text, cfg=cfg)

        # Deterministic chunking on newline-normalized text.
        for idx, (start, end) in enumerate(
            _chunk_ranges(len(text), size=cfg.chunk_size, overlap=cfg.overlap)
        ):
            chunk_id = f"{doc_id}:{idx:06d}"  # stable within doc

            if md is not None and not isinstance(md, dict):
                raise ValueError("metadata must be an object/dict when provided")
            # Always include offsets (deterministic) for traceability;
            # offsets override same-named user keys, as before.
            meta: Dict[str, Any] = {**(md or {}), "start_char": start, "end_char": end}

            rows.append(
                ChunkRow(
                    doc_id=doc_id,
                    chunk_id=chunk_id,
                    text=text[start:end],
                    chunker_version=cfg.chunker_version,
                    metadata=meta,
                ).to_json()
            )
            chunks += 1

    # Atomic write all rows (Stage 2 output is usually manageable; v2 can stream)
    payload = "\n".join(json.dumps(r, ensure_ascii=False, separators=(",", ":")) for r in rows)
    if payload:
        payload += "\n"
    atomic_write_text(out_path, payload, cfg=AtomicWriteConfig(fsync=True))

    return {"docs": docs, "chunks": chunks}
@@ -0,0 +1,71 @@
1
+ # src/embedmr/chunking/validate.py
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional, Sequence, Tuple
8
+
9
+
10
+ @dataclass(frozen=True, slots=True)
11
+ class ValidationResult:
12
+ ok: bool
13
+ docs: int
14
+ chunks: int
15
+ errors: Tuple[str, ...]
16
+
17
+
18
+ def _req_str(obj: Dict[str, Any], key: str, where: str) -> str:
19
+ v = obj.get(key)
20
+ if not isinstance(v, str) or not v.strip():
21
+ raise ValueError(f"{where}: '{key}' must be a non-empty string")
22
+ return v
23
+
24
+
25
+ def validate_chunks_jsonl(path: str | Path, *, allowed_chunker_versions: Optional[Sequence[str]] = None) -> ValidationResult:
26
+ p = Path(path)
27
+ if not p.is_file():
28
+ return ValidationResult(ok=False, docs=0, chunks=0, errors=(f"File not found: {p}",))
29
+
30
+ errors = []
31
+ docs_seen = set()
32
+ chunks = 0
33
+
34
+ with p.open("r", encoding="utf-8") as f:
35
+ for line_no, line in enumerate(f, start=1):
36
+ s = line.strip()
37
+ if not s:
38
+ continue
39
+ where = f"{p}:{line_no}"
40
+ try:
41
+ obj = json.loads(s)
42
+ if not isinstance(obj, dict):
43
+ raise ValueError(f"{where}: each line must be a JSON object")
44
+
45
+ doc_id = _req_str(obj, "doc_id", where)
46
+ chunk_id = _req_str(obj, "chunk_id", where)
47
+ text = _req_str(obj, "text", where)
48
+ chunker_version = _req_str(obj, "chunker_version", where)
49
+
50
+ if allowed_chunker_versions is not None and chunker_version not in allowed_chunker_versions:
51
+ raise ValueError(f"{where}: chunker_version '{chunker_version}' not allowed")
52
+
53
+ # Invariant: chunk_id stable within doc; we enforce our v1 format expectation
54
+ if not chunk_id.startswith(doc_id + ":"):
55
+ raise ValueError(f"{where}: chunk_id must start with '{doc_id}:'")
56
+
57
+ md = obj.get("metadata")
58
+ if md is not None and not isinstance(md, dict):
59
+ raise ValueError(f"{where}: metadata must be an object/dict if present")
60
+
61
+ # Basic sanity
62
+ if len(text) == 0:
63
+ raise ValueError(f"{where}: text must not be empty")
64
+
65
+ docs_seen.add(doc_id)
66
+ chunks += 1
67
+
68
+ except Exception as e:
69
+ errors.append(str(e))
70
+
71
+ return ValidationResult(ok=(len(errors) == 0), docs=len(docs_seen), chunks=chunks, errors=tuple(errors))
@@ -0,0 +1,62 @@
1
+ # src/embedmr/cli.py
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from embedmr.chunking.chunker import ChunkerConfig, make_chunks
9
+ from embedmr.chunking.validate import validate_chunks_jsonl
10
+
11
+
12
def _cmd_make_chunks(args: argparse.Namespace) -> int:
    """Handle the `make-chunks` subcommand; returns the process exit code."""
    chunk_cfg = ChunkerConfig(
        chunk_size=args.chunk_size,
        overlap=args.overlap,
        chunker_version=args.chunker_version,
    )
    counts = make_chunks(inputs=args.inputs, output_jsonl=args.output, cfg=chunk_cfg)
    print(f"Wrote {args.output} (docs={counts['docs']}, chunks={counts['chunks']})")
    return 0
21
+
22
+
23
def _cmd_validate(args: argparse.Namespace) -> int:
    """Handle the `validate` subcommand; returns 0 when valid, 2 otherwise."""
    res = validate_chunks_jsonl(args.path)
    if res.ok:
        print(f"OK: {args.path} (docs={res.docs}, chunks={res.chunks})")
        return 0
    print(f"INVALID: {args.path} (docs={res.docs}, chunks={res.chunks})", file=sys.stderr)
    # Cap the report at 50 messages to keep terminal output readable.
    for err in res.errors[:50]:
        print(f" - {err}", file=sys.stderr)
    hidden = len(res.errors) - 50
    if hidden > 0:
        print(f" ... (+{hidden} more)", file=sys.stderr)
    return 2
34
+
35
+
36
def build_parser() -> argparse.ArgumentParser:
    """Build the `embedmr` argument parser and wire each subcommand handler."""
    parser = argparse.ArgumentParser(prog="embedmr")
    sub = parser.add_subparsers(dest="cmd", required=True)

    mk = sub.add_parser("make-chunks", help="Create chunks.jsonl from extracted text (.txt/.json/.jsonl)")
    mk.add_argument("--input", dest="inputs", nargs="+", required=True, help="Input file(s) or dir(s)")
    mk.add_argument("--output", required=True, help="Output chunks.jsonl")
    mk.add_argument("--chunk-size", type=int, default=1000)
    mk.add_argument("--overlap", type=int, default=200)
    mk.add_argument("--chunker-version", default="chunk:v1")
    # Dispatch via `fn` default so main() stays handler-agnostic.
    mk.set_defaults(fn=_cmd_make_chunks)

    vd = sub.add_parser("validate", help="Validate chunks.jsonl schema + invariants")
    vd.add_argument("path", help="Path to chunks.jsonl")
    vd.set_defaults(fn=_cmd_validate)

    return parser
53
+
54
+
55
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse argv (or sys.argv) and run the chosen handler."""
    args = build_parser().parse_args(argv)
    return int(args.fn(args))


if __name__ == "__main__":
    raise SystemExit(main())
File without changes
@@ -0,0 +1,49 @@
1
+ # src/embedmr/core/fingerprint.py
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+
6
+ _ILLEGAL = "|"
7
+
8
+
9
+ def _ensure_no_pipe(name: str, value: str) -> None:
10
+ if _ILLEGAL in value:
11
+ raise ValueError(f"{name} must not contain '{_ILLEGAL}': {value!r}")
12
+
13
+
14
+ @dataclass(frozen=True, slots=True)
15
+ class EmbedderSpec:
16
+ embedder_id: str
17
+ embedder_version: str
18
+ dim: int
19
+ pooling: str # e.g., "mean", "cls", "last"
20
+
21
+ def __post_init__(self) -> None:
22
+ _ensure_no_pipe("embedder_id", self.embedder_id)
23
+ _ensure_no_pipe("embedder_version", self.embedder_version)
24
+ _ensure_no_pipe("pooling", self.pooling)
25
+ if not isinstance(self.dim, int) or self.dim <= 0:
26
+ raise ValueError(f"dim must be a positive int, got {self.dim!r}")
27
+
28
+
29
def build_embedder_fingerprint(
    spec: EmbedderSpec,
    *,
    normalize_version: str,
    chunker_version: str,
) -> str:
    """
    Compose the Stage 0 fingerprint string:
        embedder_id|embedder_version|dim|pooling|normalize_version|chunker_version
    Raises ValueError if either extra component contains the '|' delimiter
    (spec fields are already validated by EmbedderSpec).
    """
    for label, value in (
        ("normalize_version", normalize_version),
        ("chunker_version", chunker_version),
    ):
        if "|" in value:
            raise ValueError(f"{label} must not contain '|': {value!r}")

    parts = (
        spec.embedder_id,
        spec.embedder_version,
        str(spec.dim),
        spec.pooling,
        normalize_version,
        chunker_version,
    )
    return "|".join(parts)
@@ -0,0 +1,36 @@
1
+ # src/embedmr/core/hashing.py
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+
6
+
7
def sha256_hex(data: bytes) -> str:
    """Hex-encoded SHA-256 digest of raw bytes."""
    digest = hashlib.sha256(data)
    return digest.hexdigest()
9
+
10
+
11
def sha256_hex_str(s: str) -> str:
    """Hex-encoded SHA-256 of a str (UTF-8 encoded). Raises TypeError otherwise."""
    if not isinstance(s, str):
        raise TypeError(f"expected str, got {type(s)!r}")
    return hashlib.sha256(s.encode("utf-8")).hexdigest()
15
+
16
+
17
def cache_key_for_text_norm(*, embedder_fingerprint: str, text_norm: str) -> str:
    """
    Stage 0 invariant:
        cache_key = sha256(embedder_fingerprint + "|" + sha256(text_norm))
    """
    if not isinstance(text_norm, str):
        raise TypeError(f"expected str, got {type(text_norm)!r}")
    inner = hashlib.sha256(text_norm.encode("utf-8")).hexdigest()
    outer = f"{embedder_fingerprint}|{inner}"
    return hashlib.sha256(outer.encode("utf-8")).hexdigest()
25
+
26
+
27
def stable_hash_to_int(s: str, *, bits: int = 64) -> int:
    """
    Platform/process-stable hash (unlike builtin hash()): the first `bits`
    bits of sha256(s), interpreted as an unsigned big-endian int.
    `bits` must be 32, 64 or 128.
    """
    if bits not in (32, 64, 128):
        raise ValueError("bits must be one of {32, 64, 128}")
    prefix = hashlib.sha256(s.encode("utf-8")).digest()[: bits // 8]
    return int.from_bytes(prefix, "big", signed=False)
@@ -0,0 +1,51 @@
1
+ # src/embedmr/core/normalize.py
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ import unicodedata
6
+ from dataclasses import dataclass
7
+
8
+ _WS_RE = re.compile(r"\s+")
9
+
10
+
11
+ @dataclass(frozen=True, slots=True)
12
+ class NormalizerConfig:
13
+ """
14
+ Stage 0 invariant:
15
+ - Unicode NFKC
16
+ - newline normalize CRLF -> LF
17
+ - trim
18
+ - collapse whitespace to single space
19
+ - optional lowercase
20
+ """
21
+ lowercase: bool = False
22
+
23
+ @property
24
+ def normalize_version(self) -> str:
25
+ # Keep version stable and explicit; change only when semantics change.
26
+ base = "norm:v1:nfkc+crlf2lf+trim+ws1"
27
+ return f"{base}+lower" if self.lowercase else base
28
+
29
+
30
def normalize_text(text: str, *, cfg: NormalizerConfig = NormalizerConfig()) -> str:
    """
    Apply the Stage 0 normalization pipeline (see NormalizerConfig):
    NFKC -> CRLF/CR to LF -> optional lowercase -> trim -> collapse whitespace.

    Raises:
        TypeError: if text is not a str (including None).
    """
    # A single isinstance guard also covers None; the old separate
    # `text is None` pre-check was redundant.
    if not isinstance(text, str):
        raise TypeError(f"text must be a str, got {type(text)!r}")

    # 1) Unicode NFKC
    s = unicodedata.normalize("NFKC", text)

    # 2) newline normalize \r\n -> \n (also normalize stray \r)
    s = s.replace("\r\n", "\n").replace("\r", "\n")

    # 3) optional lowercase
    if cfg.lowercase:
        s = s.lower()

    # 4) trim + 5) collapse whitespace runs (incl. newlines/tabs) to one space
    return _WS_RE.sub(" ", s.strip())
@@ -0,0 +1,67 @@
1
+ # src/embedmr/core/schemas.py
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict, Optional
6
+
7
+
8
+ @dataclass(frozen=True, slots=True)
9
+ class ChunkRow:
10
+ doc_id: str
11
+ chunk_id: str
12
+ text: str
13
+ chunker_version: str
14
+ metadata: Optional[Dict[str, Any]] = None
15
+
16
+ def to_json(self) -> Dict[str, Any]:
17
+ out: Dict[str, Any] = {
18
+ "doc_id": self.doc_id,
19
+ "chunk_id": self.chunk_id,
20
+ "text": self.text,
21
+ "chunker_version": self.chunker_version,
22
+ }
23
+ if self.metadata is not None:
24
+ out["metadata"] = self.metadata
25
+ return out
26
+
27
+
28
+ @dataclass(frozen=True, slots=True)
29
+ class IntermediateRow:
30
+ doc_id: str
31
+ chunk_id: str
32
+ cache_key: str
33
+ vec_ref: str
34
+ dim: int
35
+ metadata: Optional[Dict[str, Any]] = None
36
+
37
+ def to_json(self) -> Dict[str, Any]:
38
+ out: Dict[str, Any] = {
39
+ "doc_id": self.doc_id,
40
+ "chunk_id": self.chunk_id,
41
+ "cache_key": self.cache_key,
42
+ "vec_ref": self.vec_ref,
43
+ "dim": self.dim,
44
+ }
45
+ if self.metadata is not None:
46
+ out["metadata"] = self.metadata
47
+ return out
48
+
49
+
50
+ @dataclass(frozen=True, slots=True)
51
+ class MappingRow:
52
+ doc_id: str
53
+ chunk_id: str
54
+ cache_key: str
55
+
56
+ def to_json(self) -> Dict[str, Any]:
57
+ return {"doc_id": self.doc_id, "chunk_id": self.chunk_id, "cache_key": self.cache_key}
58
+
59
+
60
+ @dataclass(frozen=True, slots=True)
61
+ class EmbeddingRow:
62
+ cache_key: str
63
+ vec_ref: str
64
+ dim: int
65
+
66
+ def to_json(self) -> Dict[str, Any]:
67
+ return {"cache_key": self.cache_key, "vec_ref": self.vec_ref, "dim": self.dim}
@@ -0,0 +1,5 @@
1
+ # src/embedmr/dataio/__init__.py
2
+ from .jsonl import iter_jsonl, write_jsonl_atomic
3
+ from .atomic import atomic_write_bytes, atomic_write_text
4
+
5
+ __all__ = ["iter_jsonl", "write_jsonl_atomic", "atomic_write_bytes", "atomic_write_text"]
@@ -0,0 +1,42 @@
1
+ # src/embedmr/dataio/atomic.py
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import tempfile
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+
10
@dataclass(frozen=True, slots=True)
class AtomicWriteConfig:
    """Options for atomic writes (see atomic_write_bytes / atomic_write_text)."""
    # fsync file contents before the rename, for crash durability
    fsync: bool = True
    # text encoding used by atomic_write_text
    encoding: str = "utf-8"
14
+
15
+
16
def _fsync_file(f) -> None:
    """Flush Python-level buffers and force the OS to persist the file to disk."""
    f.flush()
    os.fsync(f.fileno())
19
+
20
+
21
def atomic_write_bytes(path: str | Path, data: bytes, *, cfg: AtomicWriteConfig = AtomicWriteConfig()) -> None:
    """
    Write `data` to `path` atomically: tmp file in the same directory,
    optional fsync, then os.replace (atomic rename).

    With cfg.fsync the parent directory is also fsynced after the rename so
    the rename itself is durable (best effort; skipped where the platform
    cannot open a directory, e.g. Windows).
    """
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)

    fd, tmp_name = tempfile.mkstemp(prefix=p.name + ".", suffix=".tmp", dir=str(p.parent))
    tmp_path = Path(tmp_name)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
            if cfg.fsync:
                _fsync_file(f)
        os.replace(str(tmp_path), str(p))
        if cfg.fsync:
            # Persist the directory entry for the rename as well.
            try:
                dir_fd = os.open(str(p.parent), os.O_RDONLY)
            except OSError:
                pass  # directories not openable on this platform
            else:
                try:
                    os.fsync(dir_fd)
                except OSError:
                    pass
                finally:
                    os.close(dir_fd)
    finally:
        # No-op after a successful replace; removes the tmp file on failure.
        # missing_ok avoids the old exists()/unlink() race.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
39
+
40
+
41
def atomic_write_text(path: str | Path, text: str, *, cfg: AtomicWriteConfig = AtomicWriteConfig()) -> None:
    """Atomically write `text` to `path`, encoded with cfg.encoding."""
    atomic_write_bytes(path, text.encode(cfg.encoding), cfg=cfg)
@@ -0,0 +1,79 @@
1
+ # src/embedmr/dataio/jsonl.py
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Iterable, Iterator, List, Sequence
8
+
9
+ from embedmr.dataio.atomic import atomic_write_text, AtomicWriteConfig
10
+
11
+
12
+ def _collect_input_files(inputs: Sequence[str | Path]) -> List[Path]:
13
+ files: List[Path] = []
14
+ for x in inputs:
15
+ p = Path(x)
16
+ if p.is_dir():
17
+ # Deterministic: sort by full path string
18
+ for fp in sorted([q for q in p.rglob("*") if q.is_file()], key=lambda z: str(z).lower()):
19
+ files.append(fp)
20
+ elif p.is_file():
21
+ files.append(p)
22
+ else:
23
+ raise FileNotFoundError(f"Input path not found: {p}")
24
+ return files
25
+
26
+
27
def iter_jsonl(*inputs: str | Path) -> Iterator[Dict[str, Any]]:
    """
    Stream JSON objects from .jsonl files.

    Accepts single files, directories (recursively, deterministic order),
    or a mix. Non-.jsonl files and blank lines are skipped; malformed JSON
    or non-object lines raise ValueError with file:line context.
    """
    for fpath in _collect_input_files(list(inputs)):
        if fpath.suffix.lower() != ".jsonl":
            continue
        with fpath.open("r", encoding="utf-8") as handle:
            for line_no, raw in enumerate(handle, start=1):
                stripped = raw.strip()
                if not stripped:
                    continue
                try:
                    record = json.loads(stripped)
                except json.JSONDecodeError as e:
                    raise ValueError(f"Invalid JSON in {fpath}:{line_no}: {e}") from e
                if not isinstance(record, dict):
                    raise ValueError(f"Each JSONL line must be an object in {fpath}:{line_no}")
                yield record
51
+
52
+
53
@dataclass(frozen=True, slots=True)
class JsonlWriteConfig:
    """Options for write_jsonl_atomic."""
    # write via tmp-file + rename (atomic_write_text) instead of in place
    atomic: bool = True
    # force data to disk after writing
    fsync: bool = True
57
+
58
+
59
def write_jsonl_atomic(path: str | Path, rows: Iterable[Dict[str, Any]], *, cfg: JsonlWriteConfig = JsonlWriteConfig()) -> None:
    """
    Write rows as a JSONL file. Atomic by default (tmp -> fsync -> rename);
    the non-atomic path writes in place and optionally fsyncs.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    serialized = [json.dumps(r, ensure_ascii=False, separators=(",", ":")) for r in rows]
    # Trailing newline only when there is at least one row.
    payload = "\n".join(serialized) + ("\n" if serialized else "")

    if cfg.atomic:
        atomic_write_text(target, payload, cfg=AtomicWriteConfig(fsync=cfg.fsync))
        return

    with target.open("w", encoding="utf-8") as f:
        f.write(payload)
        if cfg.fsync:
            import os
            f.flush()
            os.fsync(f.fileno())
File without changes
@@ -0,0 +1,17 @@
1
+ # src/embedmr/runtime/constants.py
2
+ from __future__ import annotations
3
+
4
+ from enum import Enum
5
+
6
+
7
class CacheStatus(str, Enum):
    """Lifecycle of a cache_index row (values stored as TEXT in SQLite)."""
    READY = "READY"              # embedding computed; vec_ref/dim populated
    IN_PROGRESS = "IN_PROGRESS"  # a worker owns this key (single-flight)
    FAILED = "FAILED"            # last attempt failed; eligible for retry
11
+
12
+
13
class TaskStatus(str, Enum):
    """Generic task lifecycle states (not referenced by the code visible here)."""
    PENDING = "PENDING"
    RUNNING = "RUNNING"
    DONE = "DONE"
    FAILED = "FAILED"
@@ -0,0 +1,20 @@
1
+ MANIFEST.in
2
+ README.md
3
+ pyproject.toml
4
+ src/embedmr/__init__.py
5
+ src/embedmr/cli.py
6
+ src/embedmr/cache/__init__.py
7
+ src/embedmr/cache/cache_index_sqlite.py
8
+ src/embedmr/chunking/__init__.py
9
+ src/embedmr/chunking/chunker.py
10
+ src/embedmr/chunking/validate.py
11
+ src/embedmr/core/__init__.py
12
+ src/embedmr/core/fingerprint.py
13
+ src/embedmr/core/hashing.py
14
+ src/embedmr/core/normalize.py
15
+ src/embedmr/core/schemas.py
16
+ src/embedmr/dataio/__init__.py
17
+ src/embedmr/dataio/atomic.py
18
+ src/embedmr/dataio/jsonl.py
19
+ src/embedmr/runtime/__init__.py
20
+ src/embedmr/runtime/constants.py