embedmr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ include README.md
2
+ include pyproject.toml
3
+
4
+ recursive-include src/embedmr *.py
5
+
6
+ prune tests
7
+ prune .venv
8
+ prune .pytest_cache
9
+ prune build
10
+ prune dist
11
+ prune src/*.egg-info
12
+ global-exclude __pycache__ *.pyc *.pyo *.tmp *.log
embedmr-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,6 @@
1
+ Metadata-Version: 2.4
2
+ Name: embedmr
3
+ Version: 0.1.0
4
+ Summary: Deterministic text chunking, validation, and embedding-cache utilities
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
File without changes
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "embedmr"
7
+ version = "0.1.0"
8
+ description = "Deterministic text chunking, validation, and embedding-cache utilities"
9
+ readme = "README.md"
10
+ requires-python = ">=3.13"
11
+ dependencies = []
12
+
13
+ [dependency-groups]
14
+ dev = [
15
+ "build>=1.4.0",
16
+ "mypy>=1.19.1",
17
+ "pytest>=9.0.2",
18
+ "ruff>=0.15.1",
19
+ "twine>=6.2.0",
20
+ ]
21
+
22
+ [tool.setuptools]
23
+ package-dir = {"" = "src"}
24
+
25
+ [tool.setuptools.packages.find]
26
+ where = ["src"]
27
+
28
+ [project.scripts]
29
+ embedmr = "embedmr.cli:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
# src/embedmr/__init__.py
__all__ = ["__version__"]

# Keep in sync with [project].version in pyproject.toml (and PKG-INFO),
# which both declare 0.1.0; the previous "0.0.1" here was stale.
__version__ = "0.1.0"
File without changes
@@ -0,0 +1,181 @@
1
+ # src/embedmr/cache/cache_index_sqlite.py
2
+ from __future__ import annotations
3
+
4
+ import sqlite3
5
+ import time
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from embedmr.runtime.constants import CacheStatus
11
+
12
+
13
@dataclass(frozen=True, slots=True)
class CacheRecord:
    """One row of the cache_index table (see SQLiteCacheIndex)."""
    cache_key: str                 # primary key
    vec_ref: Optional[str]         # reference to the stored vector; NULL until READY
    dim: Optional[int]             # embedding dimension; NULL until READY
    status: CacheStatus            # READY | IN_PROGRESS | FAILED
    created_at: Optional[int]      # unix epoch seconds of first insert
    error: Optional[str]           # last failure message, if any
21
+
22
+
23
class SQLiteCacheIndex:
    """
    Stage 0 must-have schema + atomic state transitions.

    Table: cache_index
        cache_key   TEXT PRIMARY KEY
        vec_ref     TEXT
        dim         INTEGER
        status      TEXT (READY|IN_PROGRESS|FAILED)
        created_at  INTEGER (unix epoch seconds)
        error       TEXT

    Every public method opens, uses and CLOSES its own connection.
    (sqlite3's `with con:` only commits/rolls back — it does not close,
    so the previous implementation leaked one connection per call.)
    """

    def __init__(self, db_path: str | Path) -> None:
        self._path = Path(db_path)
        self._path.parent.mkdir(parents=True, exist_ok=True)

    def connect(self) -> sqlite3.Connection:
        """Open a new autocommit connection; transactions are issued explicitly."""
        con = sqlite3.connect(str(self._path), timeout=30.0, isolation_level=None)  # autocommit; use BEGIN manually
        con.execute("PRAGMA journal_mode=WAL;")
        con.execute("PRAGMA synchronous=NORMAL;")
        con.execute("PRAGMA foreign_keys=ON;")
        con.execute("PRAGMA busy_timeout=30000;")
        return con

    def init_db(self) -> None:
        """Create the cache_index table and its status index if missing (idempotent)."""
        con = self.connect()
        try:
            con.execute(
                """
                CREATE TABLE IF NOT EXISTS cache_index (
                    cache_key  TEXT PRIMARY KEY,
                    vec_ref    TEXT,
                    dim        INTEGER,
                    status     TEXT NOT NULL,
                    created_at INTEGER,
                    error      TEXT
                );
                """
            )
            con.execute("CREATE INDEX IF NOT EXISTS idx_cache_status ON cache_index(status);")
        finally:
            con.close()

    def get(self, cache_key: str) -> Optional[CacheRecord]:
        """Fetch one record by key, or None when the key is unknown."""
        con = self.connect()
        try:
            row = con.execute(
                "SELECT cache_key, vec_ref, dim, status, created_at, error FROM cache_index WHERE cache_key=?",
                (cache_key,),
            ).fetchone()
        finally:
            con.close()
        if row is None:
            return None
        return CacheRecord(
            cache_key=row[0],
            vec_ref=row[1],
            dim=row[2],
            status=CacheStatus(row[3]),
            created_at=row[4],
            error=row[5],
        )

    def try_mark_in_progress(self, cache_key: str) -> bool:
        """
        Single-flight primitive (local host v1):
          - If row doesn't exist: insert IN_PROGRESS -> True
          - If exists and status==FAILED: move to IN_PROGRESS (retry) -> True
          - If exists and status==IN_PROGRESS/READY: -> False
        Atomic via BEGIN IMMEDIATE + conditional updates.
        """
        now = int(time.time())
        con = self.connect()
        try:
            con.execute("BEGIN IMMEDIATE;")
            try:
                existing = con.execute(
                    "SELECT status FROM cache_index WHERE cache_key=?",
                    (cache_key,),
                ).fetchone()
                if existing is None:
                    con.execute(
                        """
                        INSERT INTO cache_index(cache_key, vec_ref, dim, status, created_at, error)
                        VALUES(?, NULL, NULL, ?, ?, NULL)
                        """,
                        (cache_key, CacheStatus.IN_PROGRESS.value, now),
                    )
                    con.execute("COMMIT;")
                    return True

                status = CacheStatus(existing[0])
                if status == CacheStatus.FAILED:
                    cur = con.execute(
                        """
                        UPDATE cache_index
                        SET status=?, vec_ref=NULL, dim=NULL, error=NULL
                        WHERE cache_key=? AND status=?
                        """,
                        (
                            CacheStatus.IN_PROGRESS.value,
                            cache_key,
                            CacheStatus.FAILED.value,
                        ),
                    )
                    # Use this statement's rowcount, not connection-wide
                    # total_changes, to decide whether we won the row.
                    changed = cur.rowcount > 0
                    con.execute("COMMIT;")
                    return changed

                con.execute("COMMIT;")
                return False
            except Exception:
                con.execute("ROLLBACK;")
                raise
        finally:
            con.close()

    def mark_ready(self, cache_key: str, *, vec_ref: str, dim: int) -> None:
        """Upsert a READY row; an update preserves the original created_at."""
        if dim <= 0:
            raise ValueError("dim must be positive")
        con = self.connect()
        try:
            con.execute("BEGIN IMMEDIATE;")
            try:
                con.execute(
                    """
                    INSERT INTO cache_index(cache_key, vec_ref, dim, status, created_at, error)
                    VALUES(?, ?, ?, ?, COALESCE((SELECT created_at FROM cache_index WHERE cache_key=?), ?), NULL)
                    ON CONFLICT(cache_key) DO UPDATE SET
                        vec_ref=excluded.vec_ref,
                        dim=excluded.dim,
                        status=excluded.status,
                        error=NULL
                    """,
                    (
                        cache_key,
                        vec_ref,
                        dim,
                        CacheStatus.READY.value,
                        cache_key,
                        int(time.time()),
                    ),
                )
                con.execute("COMMIT;")
            except Exception:
                con.execute("ROLLBACK;")
                raise
        finally:
            con.close()

    def mark_failed(self, cache_key: str, *, error: str) -> None:
        """Upsert a FAILED row; error text is trimmed and capped at 2000 chars."""
        err = (error or "").strip()[:2000]
        con = self.connect()
        try:
            con.execute("BEGIN IMMEDIATE;")
            try:
                con.execute(
                    """
                    INSERT INTO cache_index(cache_key, vec_ref, dim, status, created_at, error)
                    VALUES(?, NULL, NULL, ?, ?, ?)
                    ON CONFLICT(cache_key) DO UPDATE SET
                        status=excluded.status,
                        error=excluded.error
                    """,
                    (cache_key, CacheStatus.FAILED.value, int(time.time()), err),
                )
                con.execute("COMMIT;")
            except Exception:
                con.execute("ROLLBACK;")
                raise
        finally:
            con.close()
@@ -0,0 +1,5 @@
1
+ # src/embedmr/chunking/__init__.py
2
+ from .chunker import ChunkerConfig, make_chunks
3
+ from .validate import validate_chunks_jsonl
4
+
5
+ __all__ = ["ChunkerConfig", "make_chunks", "validate_chunks_jsonl"]
@@ -0,0 +1,170 @@
1
+ # src/embedmr/chunking/chunker.py
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple
8
+
9
+ from embedmr.core.hashing import sha256_hex_str
10
+ from embedmr.core.normalize import NormalizerConfig, normalize_text
11
+ from embedmr.core.schemas import ChunkRow
12
+ from embedmr.dataio.atomic import atomic_write_text, AtomicWriteConfig
13
+
14
+
15
@dataclass(frozen=True, slots=True)
class ChunkerConfig:
    """
    Deterministic v1: character window with overlap, on newline-normalized text.
    """
    chunk_size: int = 1000
    overlap: int = 200
    chunker_version: str = "chunk:v1"
    # doc_id stability:
    #   - if the user provides doc_id in JSON/JSONL, use it
    #   - else derive it from the normalized full document text (content-based)
    docid_normalizer: NormalizerConfig = NormalizerConfig(lowercase=False)

    def __post_init__(self) -> None:
        # Validate eagerly so a bad configuration fails at construction time.
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be > 0")
        if not 0 <= self.overlap < self.chunk_size:
            raise ValueError("overlap must be >=0 and < chunk_size")
        if "|" in self.chunker_version:
            raise ValueError("chunker_version must not contain '|' (used as delimiter elsewhere)")
35
+
36
+
37
+ def _normalize_newlines_only(text: str) -> str:
38
+ # Deterministic across OS tools: CRLF/CR -> LF
39
+ return text.replace("\r\n", "\n").replace("\r", "\n")
40
+
41
+
42
def _derive_doc_id(full_text: str, *, cfg: ChunkerConfig) -> str:
    """Derive a content-based, stable doc_id: sha256 of the normalized full text."""
    # Normalizing with cfg.docid_normalizer first makes the id insensitive
    # to whitespace/unicode/newline noise in the raw input.
    text_for_id = normalize_text(full_text, cfg=cfg.docid_normalizer)
    return sha256_hex_str(text_for_id)
46
+
47
+
48
+ def _chunk_ranges(n: int, *, size: int, overlap: int) -> Iterator[Tuple[int, int]]:
49
+ if n == 0:
50
+ return
51
+ step = size - overlap
52
+ start = 0
53
+ while start < n:
54
+ end = min(n, start + size)
55
+ yield start, end
56
+ if end == n:
57
+ break
58
+ start += step
59
+
60
+
61
+ def _read_text_file(path: Path) -> str:
62
+ return path.read_text(encoding="utf-8")
63
+
64
+
65
+ def _iter_input_documents(inputs: Sequence[str | Path]) -> Iterator[Tuple[str, str, Optional[Dict[str, Any]]]]:
66
+ """
67
+ Yields (doc_id_or_empty, text, metadata_or_none).
68
+
69
+ Accepted input files:
70
+ - .txt : doc_id derived, metadata includes {"source_path": "..."}
71
+ - .json : single object with fields {"text": "...", "doc_id"?, "metadata"?}
72
+ - .jsonl : many objects with fields {"text": "...", "doc_id"?, "metadata"?}
73
+ """
74
+ paths: List[Path] = []
75
+ for x in inputs:
76
+ p = Path(x)
77
+ if p.is_dir():
78
+ paths.extend([q for q in p.rglob("*") if q.is_file()])
79
+ elif p.is_file():
80
+ paths.append(p)
81
+ else:
82
+ raise FileNotFoundError(f"Input path not found: {p}")
83
+
84
+ for p in sorted(paths, key=lambda z: str(z).lower()):
85
+ suf = p.suffix.lower()
86
+ if suf == ".txt":
87
+ text = _read_text_file(p)
88
+ md = {"source_path": str(p)}
89
+ yield ("", text, md)
90
+ elif suf == ".json":
91
+ obj = json.loads(p.read_text(encoding="utf-8"))
92
+ if not isinstance(obj, dict) or "text" not in obj:
93
+ raise ValueError(f"{p} must be an object with at least a 'text' field")
94
+ yield (str(obj.get("doc_id") or ""), str(obj["text"]), obj.get("metadata"))
95
+ elif suf == ".jsonl":
96
+ with p.open("r", encoding="utf-8") as f:
97
+ for line_no, line in enumerate(f, start=1):
98
+ s = line.strip()
99
+ if not s:
100
+ continue
101
+ obj = json.loads(s)
102
+ if not isinstance(obj, dict) or "text" not in obj:
103
+ raise ValueError(f"{p}:{line_no} must be an object with at least a 'text' field")
104
+ yield (str(obj.get("doc_id") or ""), str(obj["text"]), obj.get("metadata"))
105
+ else:
106
+ # ignore unknown files
107
+ continue
108
+
109
+
110
def make_chunks(
    *,
    inputs: Sequence[str | Path],
    output_jsonl: str | Path,
    cfg: ChunkerConfig = ChunkerConfig(),
) -> Dict[str, int]:
    """
    Produce chunks.jsonl (atomic write) from the given inputs.

    Returns basic counts: {"docs": X, "chunks": Y}.

    Raises:
        ValueError: when a document's metadata is present but not a dict.
    """
    out_path = Path(output_jsonl)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    rows: List[Dict[str, Any]] = []
    docs = 0
    chunks = 0

    for provided_doc_id, raw_text, md in _iter_input_documents(inputs):
        docs += 1
        text = _normalize_newlines_only(raw_text)
        # User-supplied doc_id wins; otherwise derive a content-based one.
        doc_id = provided_doc_id.strip() or _derive_doc_id(text, cfg=cfg)

        # Deterministic chunking on newline-normalized text.
        for idx, (start, end) in enumerate(
            _chunk_ranges(len(text), size=cfg.chunk_size, overlap=cfg.overlap)
        ):
            chunk_id = f"{doc_id}:{idx:06d}"  # stable within doc

            if md is not None and not isinstance(md, dict):
                raise ValueError("metadata must be an object/dict when provided")
            # Always include offsets (deterministic) for traceability;
            # offsets override same-named user keys, as before.
            meta: Dict[str, Any] = {**(md or {}), "start_char": start, "end_char": end}

            rows.append(
                ChunkRow(
                    doc_id=doc_id,
                    chunk_id=chunk_id,
                    text=text[start:end],
                    chunker_version=cfg.chunker_version,
                    metadata=meta,
                ).to_json()
            )
            chunks += 1

    # Atomic write all rows (Stage 2 output is usually manageable; v2 can stream)
    payload = "\n".join(json.dumps(r, ensure_ascii=False, separators=(",", ":")) for r in rows)
    if payload:
        payload += "\n"
    atomic_write_text(out_path, payload, cfg=AtomicWriteConfig(fsync=True))

    return {"docs": docs, "chunks": chunks}
@@ -0,0 +1,71 @@
1
+ # src/embedmr/chunking/validate.py
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional, Sequence, Tuple
8
+
9
+
10
+ @dataclass(frozen=True, slots=True)
11
+ class ValidationResult:
12
+ ok: bool
13
+ docs: int
14
+ chunks: int
15
+ errors: Tuple[str, ...]
16
+
17
+
18
+ def _req_str(obj: Dict[str, Any], key: str, where: str) -> str:
19
+ v = obj.get(key)
20
+ if not isinstance(v, str) or not v.strip():
21
+ raise ValueError(f"{where}: '{key}' must be a non-empty string")
22
+ return v
23
+
24
+
25
+ def validate_chunks_jsonl(path: str | Path, *, allowed_chunker_versions: Optional[Sequence[str]] = None) -> ValidationResult:
26
+ p = Path(path)
27
+ if not p.is_file():
28
+ return ValidationResult(ok=False, docs=0, chunks=0, errors=(f"File not found: {p}",))
29
+
30
+ errors = []
31
+ docs_seen = set()
32
+ chunks = 0
33
+
34
+ with p.open("r", encoding="utf-8") as f:
35
+ for line_no, line in enumerate(f, start=1):
36
+ s = line.strip()
37
+ if not s:
38
+ continue
39
+ where = f"{p}:{line_no}"
40
+ try:
41
+ obj = json.loads(s)
42
+ if not isinstance(obj, dict):
43
+ raise ValueError(f"{where}: each line must be a JSON object")
44
+
45
+ doc_id = _req_str(obj, "doc_id", where)
46
+ chunk_id = _req_str(obj, "chunk_id", where)
47
+ text = _req_str(obj, "text", where)
48
+ chunker_version = _req_str(obj, "chunker_version", where)
49
+
50
+ if allowed_chunker_versions is not None and chunker_version not in allowed_chunker_versions:
51
+ raise ValueError(f"{where}: chunker_version '{chunker_version}' not allowed")
52
+
53
+ # Invariant: chunk_id stable within doc; we enforce our v1 format expectation
54
+ if not chunk_id.startswith(doc_id + ":"):
55
+ raise ValueError(f"{where}: chunk_id must start with '{doc_id}:'")
56
+
57
+ md = obj.get("metadata")
58
+ if md is not None and not isinstance(md, dict):
59
+ raise ValueError(f"{where}: metadata must be an object/dict if present")
60
+
61
+ # Basic sanity
62
+ if len(text) == 0:
63
+ raise ValueError(f"{where}: text must not be empty")
64
+
65
+ docs_seen.add(doc_id)
66
+ chunks += 1
67
+
68
+ except Exception as e:
69
+ errors.append(str(e))
70
+
71
+ return ValidationResult(ok=(len(errors) == 0), docs=len(docs_seen), chunks=chunks, errors=tuple(errors))
@@ -0,0 +1,62 @@
1
+ # src/embedmr/cli.py
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from embedmr.chunking.chunker import ChunkerConfig, make_chunks
9
+ from embedmr.chunking.validate import validate_chunks_jsonl
10
+
11
+
12
def _cmd_make_chunks(args: argparse.Namespace) -> int:
    """Handle the `make-chunks` subcommand; returns the process exit code."""
    chunk_cfg = ChunkerConfig(
        chunk_size=args.chunk_size,
        overlap=args.overlap,
        chunker_version=args.chunker_version,
    )
    counts = make_chunks(inputs=args.inputs, output_jsonl=args.output, cfg=chunk_cfg)
    print(f"Wrote {args.output} (docs={counts['docs']}, chunks={counts['chunks']})")
    return 0
21
+
22
+
23
def _cmd_validate(args: argparse.Namespace) -> int:
    """Handle the `validate` subcommand; returns 0 when valid, 2 otherwise."""
    res = validate_chunks_jsonl(args.path)
    if res.ok:
        print(f"OK: {args.path} (docs={res.docs}, chunks={res.chunks})")
        return 0
    print(f"INVALID: {args.path} (docs={res.docs}, chunks={res.chunks})", file=sys.stderr)
    # Cap the report at 50 messages to keep terminal output readable.
    for err in res.errors[:50]:
        print(f" - {err}", file=sys.stderr)
    hidden = len(res.errors) - 50
    if hidden > 0:
        print(f" ... (+{hidden} more)", file=sys.stderr)
    return 2
34
+
35
+
36
def build_parser() -> argparse.ArgumentParser:
    """Build the `embedmr` argument parser and wire each subcommand handler."""
    parser = argparse.ArgumentParser(prog="embedmr")
    sub = parser.add_subparsers(dest="cmd", required=True)

    mk = sub.add_parser("make-chunks", help="Create chunks.jsonl from extracted text (.txt/.json/.jsonl)")
    mk.add_argument("--input", dest="inputs", nargs="+", required=True, help="Input file(s) or dir(s)")
    mk.add_argument("--output", required=True, help="Output chunks.jsonl")
    mk.add_argument("--chunk-size", type=int, default=1000)
    mk.add_argument("--overlap", type=int, default=200)
    mk.add_argument("--chunker-version", default="chunk:v1")
    # Dispatch via `fn` default so main() stays handler-agnostic.
    mk.set_defaults(fn=_cmd_make_chunks)

    vd = sub.add_parser("validate", help="Validate chunks.jsonl schema + invariants")
    vd.add_argument("path", help="Path to chunks.jsonl")
    vd.set_defaults(fn=_cmd_validate)

    return parser
53
+
54
+
55
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse argv (or sys.argv) and run the chosen handler."""
    args = build_parser().parse_args(argv)
    return int(args.fn(args))


if __name__ == "__main__":
    raise SystemExit(main())
File without changes
@@ -0,0 +1,49 @@
1
+ # src/embedmr/core/fingerprint.py
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+
6
+ _ILLEGAL = "|"
7
+
8
+
9
+ def _ensure_no_pipe(name: str, value: str) -> None:
10
+ if _ILLEGAL in value:
11
+ raise ValueError(f"{name} must not contain '{_ILLEGAL}': {value!r}")
12
+
13
+
14
+ @dataclass(frozen=True, slots=True)
15
+ class EmbedderSpec:
16
+ embedder_id: str
17
+ embedder_version: str
18
+ dim: int
19
+ pooling: str # e.g., "mean", "cls", "last"
20
+
21
+ def __post_init__(self) -> None:
22
+ _ensure_no_pipe("embedder_id", self.embedder_id)
23
+ _ensure_no_pipe("embedder_version", self.embedder_version)
24
+ _ensure_no_pipe("pooling", self.pooling)
25
+ if not isinstance(self.dim, int) or self.dim <= 0:
26
+ raise ValueError(f"dim must be a positive int, got {self.dim!r}")
27
+
28
+
29
def build_embedder_fingerprint(
    spec: EmbedderSpec,
    *,
    normalize_version: str,
    chunker_version: str,
) -> str:
    """
    Compose the Stage 0 fingerprint string:
        embedder_id|embedder_version|dim|pooling|normalize_version|chunker_version
    Raises ValueError if either extra component contains the '|' delimiter
    (spec fields are already validated by EmbedderSpec).
    """
    for label, value in (
        ("normalize_version", normalize_version),
        ("chunker_version", chunker_version),
    ):
        if "|" in value:
            raise ValueError(f"{label} must not contain '|': {value!r}")

    parts = (
        spec.embedder_id,
        spec.embedder_version,
        str(spec.dim),
        spec.pooling,
        normalize_version,
        chunker_version,
    )
    return "|".join(parts)
@@ -0,0 +1,36 @@
1
+ # src/embedmr/core/hashing.py
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+
6
+
7
def sha256_hex(data: bytes) -> str:
    """Hex-encoded SHA-256 digest of raw bytes."""
    digest = hashlib.sha256(data)
    return digest.hexdigest()
9
+
10
+
11
def sha256_hex_str(s: str) -> str:
    """Hex-encoded SHA-256 of a str (UTF-8 encoded). Raises TypeError otherwise."""
    if not isinstance(s, str):
        raise TypeError(f"expected str, got {type(s)!r}")
    return hashlib.sha256(s.encode("utf-8")).hexdigest()
15
+
16
+
17
def cache_key_for_text_norm(*, embedder_fingerprint: str, text_norm: str) -> str:
    """
    Stage 0 invariant:
        cache_key = sha256(embedder_fingerprint + "|" + sha256(text_norm))
    """
    if not isinstance(text_norm, str):
        raise TypeError(f"expected str, got {type(text_norm)!r}")
    inner = hashlib.sha256(text_norm.encode("utf-8")).hexdigest()
    outer = f"{embedder_fingerprint}|{inner}"
    return hashlib.sha256(outer.encode("utf-8")).hexdigest()
25
+
26
+
27
def stable_hash_to_int(s: str, *, bits: int = 64) -> int:
    """
    Platform/process-stable hash (unlike builtin hash()): the first `bits`
    bits of sha256(s), interpreted as an unsigned big-endian int.
    `bits` must be 32, 64 or 128.
    """
    if bits not in (32, 64, 128):
        raise ValueError("bits must be one of {32, 64, 128}")
    prefix = hashlib.sha256(s.encode("utf-8")).digest()[: bits // 8]
    return int.from_bytes(prefix, "big", signed=False)
@@ -0,0 +1,51 @@
1
+ # src/embedmr/core/normalize.py
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ import unicodedata
6
+ from dataclasses import dataclass
7
+
8
+ _WS_RE = re.compile(r"\s+")
9
+
10
+
11
+ @dataclass(frozen=True, slots=True)
12
+ class NormalizerConfig:
13
+ """
14
+ Stage 0 invariant:
15
+ - Unicode NFKC
16
+ - newline normalize CRLF -> LF
17
+ - trim
18
+ - collapse whitespace to single space
19
+ - optional lowercase
20
+ """
21
+ lowercase: bool = False
22
+
23
+ @property
24
+ def normalize_version(self) -> str:
25
+ # Keep version stable and explicit; change only when semantics change.
26
+ base = "norm:v1:nfkc+crlf2lf+trim+ws1"
27
+ return f"{base}+lower" if self.lowercase else base
28
+
29
+
30
def normalize_text(text: str, *, cfg: NormalizerConfig = NormalizerConfig()) -> str:
    """
    Apply the Stage 0 normalization pipeline (see NormalizerConfig):
    NFKC -> CRLF/CR to LF -> optional lowercase -> trim -> collapse whitespace.

    Raises:
        TypeError: if text is not a str (including None).
    """
    # A single isinstance guard also covers None; the old separate
    # `text is None` pre-check was redundant.
    if not isinstance(text, str):
        raise TypeError(f"text must be a str, got {type(text)!r}")

    # 1) Unicode NFKC
    s = unicodedata.normalize("NFKC", text)

    # 2) newline normalize \r\n -> \n (also normalize stray \r)
    s = s.replace("\r\n", "\n").replace("\r", "\n")

    # 3) optional lowercase
    if cfg.lowercase:
        s = s.lower()

    # 4) trim + 5) collapse whitespace runs (incl. newlines/tabs) to one space
    return _WS_RE.sub(" ", s.strip())
@@ -0,0 +1,67 @@
1
+ # src/embedmr/core/schemas.py
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict, Optional
6
+
7
+
8
+ @dataclass(frozen=True, slots=True)
9
+ class ChunkRow:
10
+ doc_id: str
11
+ chunk_id: str
12
+ text: str
13
+ chunker_version: str
14
+ metadata: Optional[Dict[str, Any]] = None
15
+
16
+ def to_json(self) -> Dict[str, Any]:
17
+ out: Dict[str, Any] = {
18
+ "doc_id": self.doc_id,
19
+ "chunk_id": self.chunk_id,
20
+ "text": self.text,
21
+ "chunker_version": self.chunker_version,
22
+ }
23
+ if self.metadata is not None:
24
+ out["metadata"] = self.metadata
25
+ return out
26
+
27
+
28
+ @dataclass(frozen=True, slots=True)
29
+ class IntermediateRow:
30
+ doc_id: str
31
+ chunk_id: str
32
+ cache_key: str
33
+ vec_ref: str
34
+ dim: int
35
+ metadata: Optional[Dict[str, Any]] = None
36
+
37
+ def to_json(self) -> Dict[str, Any]:
38
+ out: Dict[str, Any] = {
39
+ "doc_id": self.doc_id,
40
+ "chunk_id": self.chunk_id,
41
+ "cache_key": self.cache_key,
42
+ "vec_ref": self.vec_ref,
43
+ "dim": self.dim,
44
+ }
45
+ if self.metadata is not None:
46
+ out["metadata"] = self.metadata
47
+ return out
48
+
49
+
50
+ @dataclass(frozen=True, slots=True)
51
+ class MappingRow:
52
+ doc_id: str
53
+ chunk_id: str
54
+ cache_key: str
55
+
56
+ def to_json(self) -> Dict[str, Any]:
57
+ return {"doc_id": self.doc_id, "chunk_id": self.chunk_id, "cache_key": self.cache_key}
58
+
59
+
60
+ @dataclass(frozen=True, slots=True)
61
+ class EmbeddingRow:
62
+ cache_key: str
63
+ vec_ref: str
64
+ dim: int
65
+
66
+ def to_json(self) -> Dict[str, Any]:
67
+ return {"cache_key": self.cache_key, "vec_ref": self.vec_ref, "dim": self.dim}
@@ -0,0 +1,5 @@
1
+ # src/embedmr/dataio/__init__.py
2
+ from .jsonl import iter_jsonl, write_jsonl_atomic
3
+ from .atomic import atomic_write_bytes, atomic_write_text
4
+
5
+ __all__ = ["iter_jsonl", "write_jsonl_atomic", "atomic_write_bytes", "atomic_write_text"]
@@ -0,0 +1,42 @@
1
+ # src/embedmr/dataio/atomic.py
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import tempfile
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+
10
@dataclass(frozen=True, slots=True)
class AtomicWriteConfig:
    """Options for atomic writes (see atomic_write_bytes / atomic_write_text)."""
    # fsync file contents before the rename, for crash durability
    fsync: bool = True
    # text encoding used by atomic_write_text
    encoding: str = "utf-8"
14
+
15
+
16
def _fsync_file(f) -> None:
    """Flush Python-level buffers and force the OS to persist the file to disk."""
    f.flush()
    os.fsync(f.fileno())
19
+
20
+
21
def atomic_write_bytes(path: str | Path, data: bytes, *, cfg: AtomicWriteConfig = AtomicWriteConfig()) -> None:
    """
    Write `data` to `path` atomically: tmp file in the same directory,
    optional fsync, then os.replace (atomic rename).

    With cfg.fsync the parent directory is also fsynced after the rename so
    the rename itself is durable (best effort; skipped where the platform
    cannot open a directory, e.g. Windows).
    """
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)

    fd, tmp_name = tempfile.mkstemp(prefix=p.name + ".", suffix=".tmp", dir=str(p.parent))
    tmp_path = Path(tmp_name)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
            if cfg.fsync:
                _fsync_file(f)
        os.replace(str(tmp_path), str(p))
        if cfg.fsync:
            # Persist the directory entry for the rename as well.
            try:
                dir_fd = os.open(str(p.parent), os.O_RDONLY)
            except OSError:
                pass  # directories not openable on this platform
            else:
                try:
                    os.fsync(dir_fd)
                except OSError:
                    pass
                finally:
                    os.close(dir_fd)
    finally:
        # No-op after a successful replace; removes the tmp file on failure.
        # missing_ok avoids the old exists()/unlink() race.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
39
+
40
+
41
def atomic_write_text(path: str | Path, text: str, *, cfg: AtomicWriteConfig = AtomicWriteConfig()) -> None:
    """Atomically write `text` to `path`, encoded with cfg.encoding."""
    atomic_write_bytes(path, text.encode(cfg.encoding), cfg=cfg)
@@ -0,0 +1,79 @@
1
+ # src/embedmr/dataio/jsonl.py
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Iterable, Iterator, List, Sequence
8
+
9
+ from embedmr.dataio.atomic import atomic_write_text, AtomicWriteConfig
10
+
11
+
12
+ def _collect_input_files(inputs: Sequence[str | Path]) -> List[Path]:
13
+ files: List[Path] = []
14
+ for x in inputs:
15
+ p = Path(x)
16
+ if p.is_dir():
17
+ # Deterministic: sort by full path string
18
+ for fp in sorted([q for q in p.rglob("*") if q.is_file()], key=lambda z: str(z).lower()):
19
+ files.append(fp)
20
+ elif p.is_file():
21
+ files.append(p)
22
+ else:
23
+ raise FileNotFoundError(f"Input path not found: {p}")
24
+ return files
25
+
26
+
27
def iter_jsonl(*inputs: str | Path) -> Iterator[Dict[str, Any]]:
    """
    Stream JSON objects from .jsonl files.

    Accepts single files, directories (recursively, deterministic order),
    or a mix. Non-.jsonl files and blank lines are skipped; malformed JSON
    or non-object lines raise ValueError with file:line context.
    """
    for fpath in _collect_input_files(list(inputs)):
        if fpath.suffix.lower() != ".jsonl":
            continue
        with fpath.open("r", encoding="utf-8") as handle:
            for line_no, raw in enumerate(handle, start=1):
                stripped = raw.strip()
                if not stripped:
                    continue
                try:
                    record = json.loads(stripped)
                except json.JSONDecodeError as e:
                    raise ValueError(f"Invalid JSON in {fpath}:{line_no}: {e}") from e
                if not isinstance(record, dict):
                    raise ValueError(f"Each JSONL line must be an object in {fpath}:{line_no}")
                yield record
51
+
52
+
53
@dataclass(frozen=True, slots=True)
class JsonlWriteConfig:
    """Options for write_jsonl_atomic."""
    # write via tmp-file + rename (atomic_write_text) instead of in place
    atomic: bool = True
    # force data to disk after writing
    fsync: bool = True
57
+
58
+
59
def write_jsonl_atomic(path: str | Path, rows: Iterable[Dict[str, Any]], *, cfg: JsonlWriteConfig = JsonlWriteConfig()) -> None:
    """
    Write rows as a JSONL file. Atomic by default (tmp -> fsync -> rename);
    the non-atomic path writes in place and optionally fsyncs.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    serialized = [json.dumps(r, ensure_ascii=False, separators=(",", ":")) for r in rows]
    # Trailing newline only when there is at least one row.
    payload = "\n".join(serialized) + ("\n" if serialized else "")

    if cfg.atomic:
        atomic_write_text(target, payload, cfg=AtomicWriteConfig(fsync=cfg.fsync))
        return

    with target.open("w", encoding="utf-8") as f:
        f.write(payload)
        if cfg.fsync:
            import os
            f.flush()
            os.fsync(f.fileno())
File without changes
@@ -0,0 +1,17 @@
1
+ # src/embedmr/runtime/constants.py
2
+ from __future__ import annotations
3
+
4
+ from enum import Enum
5
+
6
+
7
class CacheStatus(str, Enum):
    """Lifecycle of a cache_index row (values stored as TEXT in SQLite)."""
    READY = "READY"              # embedding computed; vec_ref/dim populated
    IN_PROGRESS = "IN_PROGRESS"  # a worker owns this key (single-flight)
    FAILED = "FAILED"            # last attempt failed; eligible for retry
11
+
12
+
13
class TaskStatus(str, Enum):
    """Generic task lifecycle states (not referenced by the code visible here)."""
    PENDING = "PENDING"
    RUNNING = "RUNNING"
    DONE = "DONE"
    FAILED = "FAILED"
@@ -0,0 +1,20 @@
1
+ MANIFEST.in
2
+ README.md
3
+ pyproject.toml
4
+ src/embedmr/__init__.py
5
+ src/embedmr/cli.py
6
+ src/embedmr/cache/__init__.py
7
+ src/embedmr/cache/cache_index_sqlite.py
8
+ src/embedmr/chunking/__init__.py
9
+ src/embedmr/chunking/chunker.py
10
+ src/embedmr/chunking/validate.py
11
+ src/embedmr/core/__init__.py
12
+ src/embedmr/core/fingerprint.py
13
+ src/embedmr/core/hashing.py
14
+ src/embedmr/core/normalize.py
15
+ src/embedmr/core/schemas.py
16
+ src/embedmr/dataio/__init__.py
17
+ src/embedmr/dataio/atomic.py
18
+ src/embedmr/dataio/jsonl.py
19
+ src/embedmr/runtime/__init__.py
20
+ src/embedmr/runtime/constants.py