okf-ingest 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ Metadata-Version: 2.4
2
+ Name: okf-ingest
3
+ Version: 0.1.0
4
+ Summary: Unified ingestion tool for Open Knowledge Format (OKF) bundles — validate, load into a portable DuckDB catalog, and semantically search.
5
+ Author-email: Travis Jakel <travis.s.jakel@gmail.com>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/travisjakel/okf-ingest
8
+ Project-URL: Repository, https://github.com/travisjakel/okf-ingest
9
+ Project-URL: OKF Specification, https://github.com/GoogleCloudPlatform/knowledge-catalog
10
+ Keywords: okf,open-knowledge-format,knowledge,rag,duckdb,agents
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: pyyaml>=6
14
+ Requires-Dist: duckdb>=1.0
15
+
16
+ # okf (Python binding)
17
+
18
+ Python binding of **okf-ingest** — a unified ingestion tool for
19
+ [Open Knowledge Format](https://github.com/GoogleCloudPlatform/knowledge-catalog)
20
+ (OKF) bundles. Validate a bundle, load it into a portable DuckDB catalog, and
21
+ semantically search it.
22
+
23
+ ```bash
24
+ pip install okf-ingest
25
+ okf validate ./bundle
26
+ okf ingest ./bundle --db catalog.duckdb
27
+ okf embed catalog.duckdb # uses local Ollama nomic-embed-text by default
28
+ okf rag catalog.duckdb --query "how is revenue computed?" -k 5
29
+ ```
30
+
31
+ ```python
32
+ import okf
33
+ con, summary = okf.ingest("./bundle", db_path="catalog.duckdb")
34
+ okf.embed(con) # pluggable embedder
35
+ okf.rag_search(con, "revenue", k=5)
36
+ ```
37
+
38
+ The catalog format is shared with the R binding (`okf` on CRAN-style install),
39
+ so you can ingest/embed in one language and query from the other. See the
40
+ [project README](https://github.com/travisjakel/okf-ingest) and `docs/` for the
41
+ full spec-conformance notes and architecture.
@@ -0,0 +1,26 @@
1
+ # okf (Python binding)
2
+
3
+ Python binding of **okf-ingest** — a unified ingestion tool for
4
+ [Open Knowledge Format](https://github.com/GoogleCloudPlatform/knowledge-catalog)
5
+ (OKF) bundles. Validate a bundle, load it into a portable DuckDB catalog, and
6
+ semantically search it.
7
+
8
+ ```bash
9
+ pip install okf-ingest
10
+ okf validate ./bundle
11
+ okf ingest ./bundle --db catalog.duckdb
12
+ okf embed catalog.duckdb # uses local Ollama nomic-embed-text by default
13
+ okf rag catalog.duckdb --query "how is revenue computed?" -k 5
14
+ ```
15
+
16
+ ```python
17
+ import okf
18
+ con, summary = okf.ingest("./bundle", db_path="catalog.duckdb")
19
+ okf.embed(con) # pluggable embedder
20
+ okf.rag_search(con, "revenue", k=5)
21
+ ```
22
+
23
+ The catalog format is shared with the R binding (the `okf` package, installed CRAN-style),
24
+ so you can ingest/embed in one language and query from the other. See the
25
+ [project README](https://github.com/travisjakel/okf-ingest) and `docs/` for the
26
+ full spec-conformance notes and architecture.
@@ -0,0 +1,9 @@
1
+ """okf — Open Knowledge Format ingestion (Python binding)."""
2
+ from .okf import read_bundle, validate, links, ingest, search, Bundle, Concept
3
+ # Note: import the function as `rag_search` so it does not shadow the `okf.rag`
4
+ # submodule (i.e. `import okf.rag` keeps returning the module, not this fn).
5
+ from .rag import chunk_body, ollama_embedder, embed
6
+ from .rag import rag as rag_search
7
+
8
+ __all__ = ["read_bundle", "validate", "links", "ingest", "search", "Bundle", "Concept",
9
+ "chunk_body", "ollama_embedder", "embed", "rag_search"]
@@ -0,0 +1,4 @@
1
# Entry point for `python -m okf`: run the CLI and use its return value as the
# process exit status.
import sys
from .cli import main

sys.exit(main())
@@ -0,0 +1,129 @@
1
#!/usr/bin/env python3
"""okf — command-line interface (Python). Mirrors r/okf/bin/okf.R.

  okf validate <bundle> [--strict] [--json]
  okf ingest   <bundle> --db <path> [--id <id>] [--json]
  okf query    <db> [--sql "..."] [--search <term>] [--concepts] [--links] [--findings] [--json]
  okf embed    <db> [--model <name>] [--json]
  okf rag      <db> --query <text> [-k <n>] [--model <name>] [--json]

Exit codes: 0 ok · 1 conformance failure · 2 usage error.
"""
10
+ import argparse, json, os, sys
11
+
12
+ # allow running as `python py/okf/cli.py ...` (put the package parent on path)
13
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14
+ import okf.okf as okf # noqa: E402
15
+ from okf.rag import embed as rag_embed, rag as rag_search, ollama_embedder # noqa: E402
16
+
17
+ import duckdb # noqa: E402
18
+
19
+
20
+ def _print(rows, cols, as_json):
21
+ if as_json:
22
+ print(json.dumps([dict(zip(cols, r)) for r in rows], indent=2, default=str))
23
+ else:
24
+ print(" | ".join(cols))
25
+ for r in rows:
26
+ print(" | ".join("" if v is None else str(v) for v in r))
27
+
28
+
29
def main(argv=None):
    """Parse ``argv`` and run one ``okf`` sub-command.

    ``argv=None`` lets argparse read the process arguments. The return value
    is meant to be used as the exit status: 0 on success, 1 when the bundle is
    not conformant (or has warnings under ``validate --strict``), and 2 when
    no sub-command was given.
    """
    parser = argparse.ArgumentParser(prog="okf", add_help=True)
    commands = parser.add_subparsers(dest="cmd")

    validate_cmd = commands.add_parser("validate")
    validate_cmd.add_argument("bundle")
    validate_cmd.add_argument("--strict", action="store_true")
    validate_cmd.add_argument("--json", action="store_true")

    ingest_cmd = commands.add_parser("ingest")
    ingest_cmd.add_argument("bundle")
    ingest_cmd.add_argument("--db", default=":memory:")
    ingest_cmd.add_argument("--id", default=None)
    ingest_cmd.add_argument("--json", action="store_true")

    query_cmd = commands.add_parser("query")
    query_cmd.add_argument("db")
    query_cmd.add_argument("--sql")
    query_cmd.add_argument("--search")
    query_cmd.add_argument("--concepts", action="store_true")
    query_cmd.add_argument("--links", action="store_true")
    query_cmd.add_argument("--findings", action="store_true")
    query_cmd.add_argument("--json", action="store_true")

    embed_cmd = commands.add_parser("embed")
    embed_cmd.add_argument("db")
    embed_cmd.add_argument("--model", default="nomic-embed-text")
    embed_cmd.add_argument("--json", action="store_true")

    rag_cmd = commands.add_parser("rag")
    rag_cmd.add_argument("db")
    rag_cmd.add_argument("--query", required=True)
    rag_cmd.add_argument("-k", type=int, default=5)
    rag_cmd.add_argument("--model", default="nomic-embed-text")
    rag_cmd.add_argument("--json", action="store_true")

    args = parser.parse_args(argv)
    if not args.cmd:
        parser.print_help()
        return 2

    if args.cmd == "validate":
        findings = okf.validate(okf.read_bundle(args.bundle))
        severities = [finding["severity"] for finding in findings]
        n_errors = severities.count("error")
        n_warnings = severities.count("warn")
        conformant = n_errors == 0
        if args.json:
            report = {"bundle": args.bundle, "conformant": conformant, "errors": n_errors,
                      "warnings": n_warnings, "findings": findings}
            print(json.dumps(report, indent=2))
        else:
            print(f"bundle: {args.bundle}\nconformant: {conformant} (errors: {n_errors}, warnings: {n_warnings})")
            for f in findings:
                print(f" [{f['severity']:<5}] {f['rule']:<22} {f['path']} — {f['message']}")
        # --strict promotes warnings to a failing exit status.
        failed = (not conformant) or (args.strict and n_warnings > 0)
        return 1 if failed else 0

    if args.cmd == "ingest":
        con, s = okf.ingest(args.bundle, db_path=args.db, bundle_id=args.id)
        con.close()
        if args.json:
            print(json.dumps({"bundle": args.bundle, "db": args.db, **s}, indent=2))
        else:
            print(f"ingested {args.bundle} -> {args.db}\n concepts={s['n_concepts']} "
                  f"conformant={s['n_conformant']} ({s['conformant']}) errors={s['errors']} "
                  f"warnings={s['warnings']} links={s['links_total']} broken={s['links_broken']}")
        return 0 if s["conformant"] else 1

    if args.cmd == "query":
        # Pick the statement first; precedence is --sql, --search, --links,
        # --findings, and finally the concept listing as the default.
        params = None
        if args.sql:
            statement = args.sql
        elif args.search:
            statement = "SELECT path,type,title FROM okf_concept WHERE body ILIKE ? ORDER BY path"
            params = [f"%{args.search}%"]
        elif args.links:
            statement = "SELECT * FROM okf_link"
        elif args.findings:
            statement = "SELECT * FROM okf_validation ORDER BY severity, path"
        else:
            statement = "SELECT path,reserved,type,title FROM okf_concept ORDER BY path"
        con = duckdb.connect(args.db, read_only=True)
        try:
            cur = con.execute(statement) if params is None else con.execute(statement, params)
            cols = [column[0] for column in cur.description]
            _print(cur.fetchall(), cols, args.json)
        finally:
            con.close()
        return 0

    if args.cmd == "embed":
        con = duckdb.connect(args.db)
        try:
            n = rag_embed(con, embedder=ollama_embedder(args.model))
        finally:
            con.close()
        if args.json:
            print(json.dumps({"db": args.db, "chunks": n}))
        else:
            print(f"embedded {n} chunks into {args.db}")
        return 0

    if args.cmd == "rag":
        con = duckdb.connect(args.db, read_only=True)
        try:
            rows = rag_search(con, args.query, embedder=ollama_embedder(args.model), k=args.k)
        finally:
            con.close()
        if args.json:
            cols = ["path", "title", "chunk_id", "score", "text"]
            print(json.dumps([dict(zip(cols, row)) for row in rows], indent=2, default=str))
        else:
            for path, title, cid, score, text in rows:
                print(f"[{score:.3f}] {path}#{cid} — {title}\n {text[:160].replace(chr(10),' ')}")
        return 0

    parser.print_help()
    return 2
126
+
127
+
128
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,264 @@
1
+ """okf — Open Knowledge Format ingestion (Python binding).
2
+
3
+ Mirrors the R reference binding (r/okf/R/okf.R) and writes a byte-compatible
4
+ DuckDB catalog against the same schema (schema/catalog.sql), so a bundle
5
+ ingested by either language yields the same catalog. Implements OKF v0.1
6
+ permissive consumption: never rejects a bundle for recommended-field issues.
7
+
8
+ Public API:
9
+ read_bundle(root) -> Bundle (concepts + raw links)
10
+ validate(bundle) -> list[Finding]
11
+ links(bundle) -> list[Link]
12
+ ingest(root, db_path) -> (duckdb.Connection, summary dict)
13
+ search(con, term) -> rows
14
+ """
15
+ from __future__ import annotations
16
+ import os, re, json, hashlib, datetime
17
+ from dataclasses import dataclass, field
18
+ from typing import Any, Optional
19
+ import yaml
20
+ import duckdb
21
+
22
# Basenames that read_bundle() flags as `reserved` on a Concept; validate()
# skips reserved files.
RESERVED = {"index.md", "log.md"}
23
+
24
+
25
class _OKFLoader(yaml.SafeLoader):
    """SafeLoader that leaves ISO timestamps as plain strings (matching the R
    binding) instead of coercing them to datetime — keeps `timestamp` verbatim
    and frontmatter JSON-serializable."""
    pass


# Rebuild the implicit-resolver table without any timestamp resolver. Assigning
# a fresh dict on the subclass leaves yaml.SafeLoader's own table untouched.
_OKFLoader.yaml_implicit_resolvers = {
    k: [(tag, rx) for tag, rx in v if tag != "tag:yaml.org,2002:timestamp"]
    for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items()
}
36
# Timestamp shape accepted by validate(): YYYY-MM-DDTHH:MM:SSZ with a literal
# "Z"; fractional seconds and numeric offsets do not match.
_ISO = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")
# Captures the target of a Markdown inline link: the run of characters after
# "](" up to the first whitespace or ")".
_LINK = re.compile(r"\]\(\s*([^)\s]+)")
# A leading URI scheme such as "https:" or "mailto:"; _is_external() uses it
# to classify links.
_SCHEME = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*:")
39
+
40
+ # Mirror of schema/catalog.sql (that file is canonical; keep in sync).
41
+ SCHEMA = """
42
+ CREATE TABLE IF NOT EXISTS okf_bundle (bundle_id TEXT PRIMARY KEY, root TEXT,
43
+ okf_version TEXT, source_kind TEXT, ingested_at TEXT, n_concepts INTEGER,
44
+ n_conformant INTEGER, conformant BOOLEAN);
45
+ CREATE TABLE IF NOT EXISTS okf_concept (bundle_id TEXT, path TEXT, reserved BOOLEAN,
46
+ type TEXT, title TEXT, description TEXT, resource TEXT, tags TEXT, timestamp TEXT,
47
+ body TEXT, frontmatter TEXT, parse_error TEXT, content_hash TEXT,
48
+ PRIMARY KEY (bundle_id, path));
49
+ CREATE TABLE IF NOT EXISTS okf_link (bundle_id TEXT, src_path TEXT, dst_raw TEXT,
50
+ dst_path TEXT, resolved BOOLEAN);
51
+ CREATE TABLE IF NOT EXISTS okf_validation (bundle_id TEXT, path TEXT, severity TEXT,
52
+ rule TEXT, message TEXT);
53
+ CREATE TABLE IF NOT EXISTS okf_chunk (bundle_id TEXT, path TEXT, chunk_id INTEGER,
54
+ text TEXT, embedding FLOAT[]);
55
+ """
56
+
57
+
58
@dataclass
class Concept:
    """One Markdown file of a bundle, as built by read_bundle()."""
    path: str  # bundle-relative path with "/" separators
    reserved: bool  # True when the file's basename is in RESERVED
    type: Optional[str]  # frontmatter "type", stringified by _s()
    title: Optional[str]  # frontmatter "title", stringified by _s()
    description: Optional[str]  # frontmatter "description", stringified by _s()
    resource: Optional[str]  # frontmatter "resource", stringified by _s()
    tags: Any  # frontmatter "tags" exactly as parsed; ingest() JSON-encodes it
    timestamp: Optional[str]  # frontmatter "timestamp", stringified by _s()
    body: str  # text after the frontmatter, or the whole file if none was found
    frontmatter: Optional[dict]  # parsed frontmatter mapping, None if unparseable
    parse_error: Optional[str]  # None, "no_frontmatter", "unclosed_frontmatter" or "yaml_parse_error"
    links_raw: list  # link targets found in body by extract_links()
    content_hash: str  # SHA-1 hex digest of the UTF-8 encoded body
73
+
74
+
75
@dataclass
class Bundle:
    """A parsed bundle: its identity plus every Concept read from disk."""
    bundle_id: str  # caller-supplied id, else SHA-1 hex of the resolved root path
    root: str  # real path of the bundle directory, "/"-separated
    okf_version: Optional[str]  # "okf_version" from index.md frontmatter, if present
    source_kind: str  # passed through from read_bundle(); defaults to "dir"
    concepts: list = field(default_factory=list)  # Concept objects, sorted by file path
    known: set = field(default_factory=set)  # all concept paths; used to resolve links
83
+
84
+
85
+ def _s(x):
86
+ if x is None:
87
+ return None
88
+ if isinstance(x, (list, dict)):
89
+ return None
90
+ return str(x)
91
+
92
+
93
def parse_file(path: str) -> dict:
    """Split a Markdown file into YAML frontmatter and body.

    Returns a dict with keys ``meta`` (the parsed mapping or ``None``),
    ``body`` and ``err``. ``err`` is ``None`` on success, otherwise one of
    ``"no_frontmatter"``, ``"unclosed_frontmatter"`` (both return the whole
    file as ``body``) or ``"yaml_parse_error"`` (YAML failed to load or did
    not produce a mapping).
    """
    with open(path, "r", encoding="utf-8") as fh:
        text = fh.read()
    lines = text.split("\n")
    fence = re.compile(r"^---\s*$")

    # The opening fence must be the first non-blank line of the file.
    first = next((n for n, ln in enumerate(lines) if ln.strip() != ""), None)
    if first is None or not fence.match(lines[first]):
        return {"meta": None, "body": text, "err": "no_frontmatter"}

    closing = next(
        (n for n in range(first + 1, len(lines)) if fence.match(lines[n])), None)
    if closing is None:
        return {"meta": None, "body": text, "err": "unclosed_frontmatter"}

    front = "\n".join(lines[first + 1:closing])
    body = "\n".join(lines[closing + 1:])
    try:
        meta = yaml.load(front, Loader=_OKFLoader)
    except Exception:
        # Deliberately permissive: any loader failure is reported through
        # ``err`` rather than raised.
        meta = None
    if not isinstance(meta, dict):
        return {"meta": None, "body": body, "err": "yaml_parse_error"}
    return {"meta": meta, "body": body, "err": None}
117
+
118
+
119
def extract_links(body: str) -> list:
    """Return every Markdown inline-link target in ``body``, in document order."""
    return [match.group(1) for match in _LINK.finditer(body)]
121
+
122
+
123
+ def _norm(p: str) -> str:
124
+ out = []
125
+ for s in p.replace("\\", "/").split("/"):
126
+ if s in ("", "."):
127
+ continue
128
+ if s == "..":
129
+ if out:
130
+ out.pop()
131
+ continue
132
+ out.append(s)
133
+ return "/".join(out)
134
+
135
+
136
def _is_external(raw: str) -> bool:
    """Tell whether a link target starts with a URI scheme (fragment ignored)."""
    target, _, _fragment = raw.partition("#")
    return _SCHEME.match(target) is not None
138
+
139
+
140
def resolve_link(raw: str, src_rel: str, known: set) -> Optional[str]:
    """Resolve a link target to a bundle-relative concept path.

    Args:
        raw: link target as written in the Markdown body; any ``#fragment``
            is ignored.
        src_rel: bundle-relative path of the file containing the link.
        known: set of all concept paths in the bundle.

    Returns:
        The normalised path when it names a known concept, otherwise ``None``.
        A target starting with ``/`` is taken relative to the bundle root; any
        other target is taken relative to the directory of ``src_rel``. A
        fragment-only target (``#section``) refers to ``src_rel`` itself.
    """
    target = raw.split("#", 1)[0]
    if target == "":
        # Same-document anchor: without this guard the empty target would
        # resolve to the source *directory* and be reported as a broken link.
        return src_rel if src_rel in known else None
    if target.startswith("/"):
        cand = target[1:]
    else:
        parent = os.path.dirname(src_rel)
        cand = target if parent == "" else f"{parent}/{target}"
    cand = _norm(cand)
    return cand if cand in known else None
149
+
150
+
151
def read_bundle(root: str, bundle_id: Optional[str] = None, source_kind: str = "dir") -> Bundle:
    """Read every ``.md`` file under ``root`` into a Bundle.

    Files are visited in sorted path order and each becomes a Concept whose
    ``path`` is relative to the resolved root with ``/`` separators. When
    ``bundle_id`` is omitted it is the SHA-1 hex digest of the resolved root
    path. ``okf_version`` is taken from the frontmatter of a top-level
    ``index.md`` when one exists.
    """
    root = os.path.realpath(root).replace("\\", "/")
    md_files = sorted(
        os.path.join(dirpath, name)
        for dirpath, _dirs, names in os.walk(root)
        for name in names
        if name.endswith(".md")
    )

    concepts = []
    for full_path in md_files:
        parsed = parse_file(full_path)
        meta = parsed["meta"] or {}
        body = parsed["body"]
        concepts.append(Concept(
            path=os.path.relpath(full_path, root).replace("\\", "/"),
            reserved=os.path.basename(full_path) in RESERVED,
            type=_s(meta.get("type")),
            title=_s(meta.get("title")),
            description=_s(meta.get("description")),
            resource=_s(meta.get("resource")),
            tags=meta.get("tags"),
            timestamp=_s(meta.get("timestamp")),
            body=body,
            frontmatter=parsed["meta"],
            parse_error=parsed["err"],
            links_raw=extract_links(body),
            content_hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
        ))

    known = {c.path for c in concepts}
    index = next((c for c in concepts if c.path == "index.md"), None)
    okf_version = None if index is None else _s((index.frontmatter or {}).get("okf_version"))
    if bundle_id is None:
        bundle_id = hashlib.sha1(root.encode("utf-8")).hexdigest()
    return Bundle(bundle_id, root, okf_version, source_kind, concepts, known)
178
+
179
+
180
def links(b: Bundle) -> list:
    """List every non-external link in the bundle with its resolution.

    Each entry is a dict with ``src_path``, ``dst_raw``, ``dst_path`` (the
    resolved concept path or ``None``) and ``resolved``. Targets that start
    with a URI scheme are skipped.
    """
    rows = []
    for concept in b.concepts:
        internal = (raw for raw in concept.links_raw if not _is_external(raw))
        for raw in internal:
            target = resolve_link(raw, concept.path, b.known)
            rows.append(dict(src_path=concept.path, dst_raw=raw,
                             dst_path=target, resolved=target is not None))
    return rows
190
+
191
+
192
def validate(b: Bundle) -> list:
    """Check a bundle and return its findings.

    Each finding is a dict with ``path``, ``severity`` (``"error"`` or
    ``"warn"``), ``rule`` and ``message``. Reserved files are skipped. A file
    without parseable frontmatter gets a single error and no further checks;
    otherwise a missing ``type`` is an error and missing or malformed
    recommended fields are warnings. Unresolved internal links are appended
    last as warnings.
    """
    findings = []

    def report(path, severity, rule, message):
        findings.append(dict(path=path, severity=severity, rule=rule, message=message))

    recommended = (
        ("title", "missing_title", "recommended field title absent"),
        ("description", "missing_description", "recommended field description absent"),
        ("timestamp", "missing_timestamp", "recommended field timestamp absent"),
    )

    for c in (concept for concept in b.concepts if not concept.reserved):
        if c.parse_error is not None:
            report(c.path, "error", "frontmatter_unparseable",
                   f"no parseable frontmatter ({c.parse_error})")
            continue
        if not c.type:
            report(c.path, "error", "missing_type", "frontmatter has no non-empty type")
        for attr, rule, message in recommended:
            if getattr(c, attr) is None:
                report(c.path, "warn", rule, message)
        if c.timestamp is not None and not _ISO.match(c.timestamp):
            report(c.path, "warn", "timestamp_not_iso8601",
                   f"timestamp not ISO-8601: {c.timestamp}")

    for link in links(b):
        if link["resolved"]:
            continue
        report(link["src_path"], "warn", "broken_link", f"unresolved link: {link['dst_raw']}")
    return findings
217
+
218
+
219
def ingest(root, db_path: str = ":memory:", ingested_at: Optional[str] = None,
           bundle_id: Optional[str] = None, source_kind: str = "dir"):
    """Validate a bundle and load it into a DuckDB catalog.

    Args:
        root: bundle directory, or an already-read ``Bundle`` (in which case
            ``bundle_id`` and ``source_kind`` are ignored).
        db_path: DuckDB database path; ``":memory:"`` for an in-memory catalog.
        ingested_at: timestamp stored on the bundle row; defaults to the
            current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
        bundle_id: forwarded to ``read_bundle``.
        source_kind: forwarded to ``read_bundle``.

    Returns:
        ``(con, summary)``: the open DuckDB connection (the caller closes it)
        and a dict of counts (``n_files``, ``n_concepts``, ``n_conformant``,
        ``conformant``, ``errors``, ``warnings``, ``links_total``,
        ``links_broken``).

    Re-ingesting a bundle replaces its previous bundle, concept, link and
    validation rows instead of failing on the primary keys. ``okf_chunk`` is
    left untouched; re-run the embedding step to refresh it. If loading
    fails the connection is closed before the exception propagates.
    """
    b = root if isinstance(root, Bundle) else read_bundle(root, bundle_id, source_kind)
    val = validate(b)
    lk = links(b)
    if ingested_at is None:
        ingested_at = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    err_paths = {f["path"] for f in val if f["severity"] == "error"}
    non_reserved = [c for c in b.concepts if not c.reserved]
    n_conf = sum(1 for c in non_reserved if c.path not in err_paths)
    conformant = len(err_paths) == 0

    con = duckdb.connect(db_path)
    try:
        for stmt in (s.strip() for s in SCHEMA.split(";") if s.strip()):
            con.execute(stmt)

        # Remove rows left by an earlier ingest of the same bundle. These
        # deletes are committed *before* the insert transaction starts because
        # DuckDB may report a spurious primary-key violation when a key is
        # deleted and re-inserted inside a single transaction.
        for table in ("okf_validation", "okf_link", "okf_concept", "okf_bundle"):
            con.execute(f"DELETE FROM {table} WHERE bundle_id = ?", [b.bundle_id])

        # One transaction for all inserts: a failure leaves no partial bundle
        # behind, and it avoids a commit per row.
        con.execute("BEGIN TRANSACTION")
        try:
            con.execute("INSERT INTO okf_bundle VALUES (?,?,?,?,?,?,?,?)",
                        [b.bundle_id, b.root, b.okf_version, b.source_kind, ingested_at,
                         len(non_reserved), n_conf, conformant])
            for c in b.concepts:
                con.execute("INSERT INTO okf_concept VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                            [b.bundle_id, c.path, c.reserved, c.type, c.title, c.description,
                             c.resource, None if c.tags is None else json.dumps(c.tags),
                             c.timestamp, c.body, json.dumps(c.frontmatter or {}),
                             c.parse_error, c.content_hash])
            for lk_ in lk:
                con.execute("INSERT INTO okf_link VALUES (?,?,?,?,?)",
                            [b.bundle_id, lk_["src_path"], lk_["dst_raw"], lk_["dst_path"],
                             lk_["resolved"]])
            for f in val:
                con.execute("INSERT INTO okf_validation VALUES (?,?,?,?,?)",
                            [b.bundle_id, f["path"], f["severity"], f["rule"], f["message"]])
            con.execute("COMMIT")
        except Exception:
            con.execute("ROLLBACK")
            raise
    except Exception:
        # Do not leak the connection (and its file lock) on failure.
        con.close()
        raise

    summary = {
        "n_files": len(b.concepts), "n_concepts": len(non_reserved), "n_conformant": n_conf,
        "conformant": conformant,
        "errors": sum(1 for f in val if f["severity"] == "error"),
        "warnings": sum(1 for f in val if f["severity"] == "warn"),
        "links_total": len(lk), "links_broken": sum(1 for x in lk if not x["resolved"]),
    }
    return con, summary
259
+
260
+
261
def search(con, term: str):
    """Return ``(path, type, title)`` rows for concepts whose body contains ``term``.

    Matching is a case-insensitive substring test (``ILIKE``) and the rows are
    ordered by path.
    """
    sql = "SELECT path, type, title FROM okf_concept WHERE body ILIKE ? ORDER BY path"
    pattern = f"%{term}%"
    cursor = con.execute(sql, [pattern])
    return cursor.fetchall()
@@ -0,0 +1,78 @@
1
+ """okf RAG layer (Python) — mirrors the R binding's embeddings/RAG functions.
2
+
3
+ Pluggable embedder (default: local Ollama nomic-embed-text); brute-force cosine
4
+ search via DuckDB's native list_cosine_similarity. An embedder is a callable
5
+ texts:list[str] -> list[list[float]].
6
+ """
7
+ from __future__ import annotations
8
+ import os, re, json, urllib.request
9
+ from typing import Callable, Optional
10
+
11
+
12
def chunk_body(body: str, target_chars: int = 600) -> list:
    """Split ``body`` into chunks made of whole paragraphs.

    Paragraphs are separated by blank lines and stripped of surrounding
    whitespace. Consecutive paragraphs are joined with a blank line for as
    long as the chunk stays within ``target_chars``; a single paragraph longer
    than ``target_chars`` is kept whole. A falsy ``body`` yields ``[]``.
    """
    pieces = (segment.strip() for segment in re.split(r"\n[ \t]*\n", body or ""))
    chunks = []
    for para in filter(None, pieces):
        # +2 accounts for the "\n\n" separator added when merging.
        if chunks and len(chunks[-1]) + len(para) + 2 <= target_chars:
            chunks[-1] = chunks[-1] + "\n\n" + para
        else:
            chunks.append(para)
    return chunks
25
+
26
+
27
def ollama_embedder(model: str = "nomic-embed-text",
                    url: Optional[str] = None) -> Callable:
    """Build an embedder backed by an Ollama server.

    Args:
        model: name of the Ollama embedding model.
        url: base URL of the server; defaults to the ``OLLAMA_URL``
            environment variable, then ``http://localhost:11434``.

    Returns:
        A callable taking a list of strings and returning one embedding per
        string, in order. It issues one POST to ``/api/embeddings`` per text
        with a 120-second timeout; network and HTTP errors from ``urllib``
        propagate to the caller.
    """
    base = url or os.environ.get("OLLAMA_URL", "http://localhost:11434")
    # Tolerate a trailing slash so the request path never becomes "//api/...".
    endpoint = f"{base.rstrip('/')}/api/embeddings"

    def embed(texts):
        out = []
        for t in texts:
            payload = json.dumps({"model": model, "prompt": t}).encode("utf-8")
            req = urllib.request.Request(
                endpoint, data=payload,
                headers={"Content-Type": "application/json"})
            with urllib.request.urlopen(req, timeout=120) as r:
                out.append(json.loads(r.read())["embedding"])
        return out
    return embed
42
+
43
+
44
+ def _vec_lit(v) -> str:
45
+ return "[" + ",".join(f"{float(z):.8g}" for z in v) + "]::FLOAT[]"
46
+
47
+
48
def embed(con, embedder: Optional[Callable] = None, target_chars: int = 600) -> int:
    """Chunk and embed every non-reserved concept, replacing ``okf_chunk``.

    Args:
        con: writable DuckDB connection to a catalog containing ``okf_concept``.
        embedder: callable mapping a list of texts to a list of vectors;
            defaults to ``ollama_embedder()``.
        target_chars: chunk size passed to ``chunk_body``.

    Returns:
        The number of chunks written.

    Raises:
        ValueError: if the embedder returns a different number of vectors
            than the number of texts it was given.

    All embeddings are computed before ``okf_chunk`` is modified, so a failing
    embedder (for example an unreachable server) leaves the existing
    embeddings in place.
    """
    embedder = embedder or ollama_embedder()
    con.execute("CREATE TABLE IF NOT EXISTS okf_chunk (bundle_id TEXT, path TEXT, "
                "chunk_id INTEGER, text TEXT, embedding FLOAT[])")
    rows = con.execute(
        "SELECT bundle_id, path, body FROM okf_concept WHERE reserved = FALSE ORDER BY path"
    ).fetchall()

    pending = []
    for bundle_id, path, body in rows:
        chs = chunk_body(body, target_chars)
        if not chs:
            continue
        embs = list(embedder(chs))
        if len(embs) != len(chs):
            # zip() would silently drop the chunks that have no vector.
            raise ValueError(
                f"embedder returned {len(embs)} vectors for {len(chs)} chunks of {path}")
        for k, (text, vec) in enumerate(zip(chs, embs), start=1):
            # Render the literal now so a malformed vector also fails before
            # the table is touched.
            pending.append((bundle_id, path, k, text, _vec_lit(vec)))

    con.execute("DELETE FROM okf_chunk")
    for bundle_id, path, k, text, literal in pending:
        con.execute(
            f"INSERT INTO okf_chunk VALUES (?,?,?,?, {literal})",
            [bundle_id, path, k, text])
    return len(pending)
68
+
69
+
70
def rag(con, query: str, embedder: Optional[Callable] = None, k: int = 5):
    """Return the ``k`` chunks most similar to ``query``.

    The query is embedded with ``embedder`` (default ``ollama_embedder()``)
    and compared against every stored chunk embedding with DuckDB's
    ``list_cosine_similarity``. Rows are
    ``(path, title, chunk_id, score, text)`` ordered by descending score.
    """
    if not embedder:
        embedder = ollama_embedder()
    query_vec = embedder([query])[0]
    limit = int(k)
    sql = f"""SELECT ch.path, c.title, ch.chunk_id,
        list_cosine_similarity(ch.embedding, {_vec_lit(query_vec)}) AS score, ch.text
        FROM okf_chunk ch JOIN okf_concept c USING (bundle_id, path)
        WHERE ch.embedding IS NOT NULL ORDER BY score DESC LIMIT {limit}"""
    return con.execute(sql).fetchall()
@@ -0,0 +1,41 @@
1
+ Metadata-Version: 2.4
2
+ Name: okf-ingest
3
+ Version: 0.1.0
4
+ Summary: Unified ingestion tool for Open Knowledge Format (OKF) bundles — validate, load into a portable DuckDB catalog, and semantically search.
5
+ Author-email: Travis Jakel <travis.s.jakel@gmail.com>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/travisjakel/okf-ingest
8
+ Project-URL: Repository, https://github.com/travisjakel/okf-ingest
9
+ Project-URL: OKF Specification, https://github.com/GoogleCloudPlatform/knowledge-catalog
10
+ Keywords: okf,open-knowledge-format,knowledge,rag,duckdb,agents
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: pyyaml>=6
14
+ Requires-Dist: duckdb>=1.0
15
+
16
+ # okf (Python binding)
17
+
18
+ Python binding of **okf-ingest** — a unified ingestion tool for
19
+ [Open Knowledge Format](https://github.com/GoogleCloudPlatform/knowledge-catalog)
20
+ (OKF) bundles. Validate a bundle, load it into a portable DuckDB catalog, and
21
+ semantically search it.
22
+
23
+ ```bash
24
+ pip install okf-ingest
25
+ okf validate ./bundle
26
+ okf ingest ./bundle --db catalog.duckdb
27
+ okf embed catalog.duckdb # uses local Ollama nomic-embed-text by default
28
+ okf rag catalog.duckdb --query "how is revenue computed?" -k 5
29
+ ```
30
+
31
+ ```python
32
+ import okf
33
+ con, summary = okf.ingest("./bundle", db_path="catalog.duckdb")
34
+ okf.embed(con) # pluggable embedder
35
+ okf.rag_search(con, "revenue", k=5)
36
+ ```
37
+
38
+ The catalog format is shared with the R binding (`okf` on CRAN-style install),
39
+ so you can ingest/embed in one language and query from the other. See the
40
+ [project README](https://github.com/travisjakel/okf-ingest) and `docs/` for the
41
+ full spec-conformance notes and architecture.
@@ -0,0 +1,13 @@
1
+ README.md
2
+ pyproject.toml
3
+ okf/__init__.py
4
+ okf/__main__.py
5
+ okf/cli.py
6
+ okf/okf.py
7
+ okf/rag.py
8
+ okf_ingest.egg-info/PKG-INFO
9
+ okf_ingest.egg-info/SOURCES.txt
10
+ okf_ingest.egg-info/dependency_links.txt
11
+ okf_ingest.egg-info/entry_points.txt
12
+ okf_ingest.egg-info/requires.txt
13
+ okf_ingest.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ okf = okf.cli:main
@@ -0,0 +1,2 @@
1
+ pyyaml>=6
2
+ duckdb>=1.0
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "okf-ingest"
7
+ version = "0.1.0"
8
+ description = "Unified ingestion tool for Open Knowledge Format (OKF) bundles — validate, load into a portable DuckDB catalog, and semantically search."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "Apache-2.0"
12
+ authors = [{ name = "Travis Jakel", email = "travis.s.jakel@gmail.com" }]
13
+ keywords = ["okf", "open-knowledge-format", "knowledge", "rag", "duckdb", "agents"]
14
+ dependencies = ["pyyaml>=6", "duckdb>=1.0"]
15
+
16
+ [project.urls]
17
+ Homepage = "https://github.com/travisjakel/okf-ingest"
18
+ Repository = "https://github.com/travisjakel/okf-ingest"
19
+ "OKF Specification" = "https://github.com/GoogleCloudPlatform/knowledge-catalog"
20
+
21
+ [project.scripts]
22
+ okf = "okf.cli:main"
23
+
24
+ [tool.setuptools]
25
+ packages = ["okf"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+