okf-ingest 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- okf_ingest-0.1.0/PKG-INFO +41 -0
- okf_ingest-0.1.0/README.md +26 -0
- okf_ingest-0.1.0/okf/__init__.py +9 -0
- okf_ingest-0.1.0/okf/__main__.py +4 -0
- okf_ingest-0.1.0/okf/cli.py +129 -0
- okf_ingest-0.1.0/okf/okf.py +264 -0
- okf_ingest-0.1.0/okf/rag.py +78 -0
- okf_ingest-0.1.0/okf_ingest.egg-info/PKG-INFO +41 -0
- okf_ingest-0.1.0/okf_ingest.egg-info/SOURCES.txt +13 -0
- okf_ingest-0.1.0/okf_ingest.egg-info/dependency_links.txt +1 -0
- okf_ingest-0.1.0/okf_ingest.egg-info/entry_points.txt +2 -0
- okf_ingest-0.1.0/okf_ingest.egg-info/requires.txt +2 -0
- okf_ingest-0.1.0/okf_ingest.egg-info/top_level.txt +1 -0
- okf_ingest-0.1.0/pyproject.toml +25 -0
- okf_ingest-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: okf-ingest
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Unified ingestion tool for Open Knowledge Format (OKF) bundles — validate, load into a portable DuckDB catalog, and semantically search.
|
|
5
|
+
Author-email: Travis Jakel <travis.s.jakel@gmail.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/travisjakel/okf-ingest
|
|
8
|
+
Project-URL: Repository, https://github.com/travisjakel/okf-ingest
|
|
9
|
+
Project-URL: OKF Specification, https://github.com/GoogleCloudPlatform/knowledge-catalog
|
|
10
|
+
Keywords: okf,open-knowledge-format,knowledge,rag,duckdb,agents
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: pyyaml>=6
|
|
14
|
+
Requires-Dist: duckdb>=1.0
|
|
15
|
+
|
|
16
|
+
# okf (Python binding)
|
|
17
|
+
|
|
18
|
+
Python binding of **okf-ingest** — a unified ingestion tool for
|
|
19
|
+
[Open Knowledge Format](https://github.com/GoogleCloudPlatform/knowledge-catalog)
|
|
20
|
+
(OKF) bundles. Validate a bundle, load it into a portable DuckDB catalog, and
|
|
21
|
+
semantically search it.
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install okf-ingest
|
|
25
|
+
okf validate ./bundle
|
|
26
|
+
okf ingest ./bundle --db catalog.duckdb
|
|
27
|
+
okf embed catalog.duckdb # uses local Ollama nomic-embed-text by default
|
|
28
|
+
okf rag catalog.duckdb --query "how is revenue computed?" -k 5
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import okf
|
|
33
|
+
con, summary = okf.ingest("./bundle", db_path="catalog.duckdb")
|
|
34
|
+
okf.embed(con) # pluggable embedder
|
|
35
|
+
okf.rag_search(con, "revenue", k=5)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
The catalog format is shared with the R binding (`okf` on CRAN-style install),
|
|
39
|
+
so you can ingest/embed in one language and query from the other. See the
|
|
40
|
+
[project README](https://github.com/travisjakel/okf-ingest) and `docs/` for the
|
|
41
|
+
full spec-conformance notes and architecture.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# okf (Python binding)
|
|
2
|
+
|
|
3
|
+
Python binding of **okf-ingest** — a unified ingestion tool for
|
|
4
|
+
[Open Knowledge Format](https://github.com/GoogleCloudPlatform/knowledge-catalog)
|
|
5
|
+
(OKF) bundles. Validate a bundle, load it into a portable DuckDB catalog, and
|
|
6
|
+
semantically search it.
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
pip install okf-ingest
|
|
10
|
+
okf validate ./bundle
|
|
11
|
+
okf ingest ./bundle --db catalog.duckdb
|
|
12
|
+
okf embed catalog.duckdb # uses local Ollama nomic-embed-text by default
|
|
13
|
+
okf rag catalog.duckdb --query "how is revenue computed?" -k 5
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
import okf
|
|
18
|
+
con, summary = okf.ingest("./bundle", db_path="catalog.duckdb")
|
|
19
|
+
okf.embed(con) # pluggable embedder
|
|
20
|
+
okf.rag_search(con, "revenue", k=5)
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
The catalog format is shared with the R binding (`okf` on CRAN-style install),
|
|
24
|
+
so you can ingest/embed in one language and query from the other. See the
|
|
25
|
+
[project README](https://github.com/travisjakel/okf-ingest) and `docs/` for the
|
|
26
|
+
full spec-conformance notes and architecture.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""okf — Open Knowledge Format ingestion (Python binding)."""
|
|
2
|
+
from .okf import read_bundle, validate, links, ingest, search, Bundle, Concept
|
|
3
|
+
# Note: import the function as `rag_search` so it does not shadow the `okf.rag`
|
|
4
|
+
# submodule (i.e. `import okf.rag` keeps returning the module, not this fn).
|
|
5
|
+
from .rag import chunk_body, ollama_embedder, embed
|
|
6
|
+
from .rag import rag as rag_search
|
|
7
|
+
|
|
8
|
+
__all__ = ["read_bundle", "validate", "links", "ingest", "search", "Bundle", "Concept",
|
|
9
|
+
"chunk_body", "ollama_embedder", "embed", "rag_search"]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""okf — command-line interface (Python). Mirrors r/okf/bin/okf.R.
|
|
3
|
+
|
|
4
|
+
okf validate <bundle> [--strict] [--json]
|
|
5
|
+
okf ingest <bundle> --db <path> [--id <id>] [--json]
|
|
6
|
+
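okf query <db> [--sql "..."] [--search <term>] [--concepts] [--links] [--findings] [--json]
okf embed <db> [--model <name>] [--json]
okf rag <db> --query "..." [-k <n>] [--model <name>] [--json]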
|
|
7
|
+
|
|
8
|
+
Exit codes: 0 ok · 1 conformance failure · 2 usage error.
|
|
9
|
+
"""
|
|
10
|
+
import argparse, json, os, sys
|
|
11
|
+
|
|
12
|
+
# allow running as `python py/okf/cli.py ...` (put the package parent on path)
|
|
13
|
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
14
|
+
import okf.okf as okf # noqa: E402
|
|
15
|
+
from okf.rag import embed as rag_embed, rag as rag_search, ollama_embedder # noqa: E402
|
|
16
|
+
|
|
17
|
+
import duckdb # noqa: E402
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _print(rows, cols, as_json):
|
|
21
|
+
if as_json:
|
|
22
|
+
print(json.dumps([dict(zip(cols, r)) for r in rows], indent=2, default=str))
|
|
23
|
+
else:
|
|
24
|
+
print(" | ".join(cols))
|
|
25
|
+
for r in rows:
|
|
26
|
+
print(" | ".join("" if v is None else str(v) for v in r))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def main(argv=None):
    """Run the okf command-line interface and return the exit code.

    argv: argument list to parse; None lets argparse fall back to
        ``sys.argv[1:]``.

    Returns 0 on success, 1 when a validated/ingested bundle is not
    conformant (or ``validate --strict`` saw warnings), and 2 when no
    subcommand was given.
    """
    parser = argparse.ArgumentParser(prog="okf", add_help=True)
    commands = parser.add_subparsers(dest="cmd")

    validate_cmd = commands.add_parser("validate")
    validate_cmd.add_argument("bundle")
    validate_cmd.add_argument("--strict", action="store_true")
    validate_cmd.add_argument("--json", action="store_true")

    ingest_cmd = commands.add_parser("ingest")
    ingest_cmd.add_argument("bundle")
    ingest_cmd.add_argument("--db", default=":memory:")
    ingest_cmd.add_argument("--id", default=None)
    ingest_cmd.add_argument("--json", action="store_true")

    query_cmd = commands.add_parser("query")
    query_cmd.add_argument("db")
    query_cmd.add_argument("--sql")
    query_cmd.add_argument("--search")
    query_cmd.add_argument("--concepts", action="store_true")
    query_cmd.add_argument("--links", action="store_true")
    query_cmd.add_argument("--findings", action="store_true")
    query_cmd.add_argument("--json", action="store_true")

    embed_cmd = commands.add_parser("embed")
    embed_cmd.add_argument("db")
    embed_cmd.add_argument("--model", default="nomic-embed-text")
    embed_cmd.add_argument("--json", action="store_true")

    rag_cmd = commands.add_parser("rag")
    rag_cmd.add_argument("db")
    rag_cmd.add_argument("--query", required=True)
    rag_cmd.add_argument("-k", type=int, default=5)
    rag_cmd.add_argument("--model", default="nomic-embed-text")
    rag_cmd.add_argument("--json", action="store_true")

    args = parser.parse_args(argv)
    if not args.cmd:
        parser.print_help()
        return 2

    if args.cmd == "validate":
        bundle = okf.read_bundle(args.bundle)
        findings = okf.validate(bundle)
        n_errors = sum(1 for f in findings if f["severity"] == "error")
        n_warnings = sum(1 for f in findings if f["severity"] == "warn")
        conformant = n_errors == 0
        if args.json:
            report = {"bundle": args.bundle, "conformant": conformant, "errors": n_errors,
                      "warnings": n_warnings, "findings": findings}
            print(json.dumps(report, indent=2))
        else:
            print(f"bundle: {args.bundle}\nconformant: {conformant} (errors: {n_errors}, warnings: {n_warnings})")
            for finding in findings:
                print(f" [{finding['severity']:<5}] {finding['rule']:<22} {finding['path']} — {finding['message']}")
        # --strict promotes warnings to a failing exit code.
        failed = (not conformant) or (args.strict and n_warnings > 0)
        return 1 if failed else 0

    if args.cmd == "ingest":
        con, summary = okf.ingest(args.bundle, db_path=args.db, bundle_id=args.id)
        con.close()
        if args.json:
            payload = {"bundle": args.bundle, "db": args.db}
            payload.update(summary)
            print(json.dumps(payload, indent=2))
        else:
            print(f"ingested {args.bundle} -> {args.db}\n concepts={summary['n_concepts']} "
                  f"conformant={summary['n_conformant']} ({summary['conformant']}) errors={summary['errors']} "
                  f"warnings={summary['warnings']} links={summary['links_total']} broken={summary['links_broken']}")
        return 0 if summary["conformant"] else 1

    if args.cmd == "query":
        # Pick the statement first; --sql wins, then --search, --links,
        # --findings, and finally the concept listing as the default.
        params = None
        if args.sql:
            statement = args.sql
        elif args.search:
            statement = "SELECT path,type,title FROM okf_concept WHERE body ILIKE ? ORDER BY path"
            params = [f"%{args.search}%"]
        elif args.links:
            statement = "SELECT * FROM okf_link"
        elif args.findings:
            statement = "SELECT * FROM okf_validation ORDER BY severity, path"
        else:
            statement = "SELECT path,reserved,type,title FROM okf_concept ORDER BY path"

        con = duckdb.connect(args.db, read_only=True)
        try:
            if params is None:
                cursor = con.execute(statement)
            else:
                cursor = con.execute(statement, params)
            columns = [column[0] for column in cursor.description]
            _print(cursor.fetchall(), columns, args.json)
        finally:
            con.close()
        return 0

    if args.cmd == "embed":
        con = duckdb.connect(args.db)
        try:
            n_chunks = rag_embed(con, embedder=ollama_embedder(args.model))
        finally:
            con.close()
        if args.json:
            print(json.dumps({"db": args.db, "chunks": n_chunks}))
        else:
            print(f"embedded {n_chunks} chunks into {args.db}")
        return 0

    if args.cmd == "rag":
        con = duckdb.connect(args.db, read_only=True)
        try:
            hits = rag_search(con, args.query, embedder=ollama_embedder(args.model), k=args.k)
        finally:
            con.close()
        if args.json:
            columns = ["path", "title", "chunk_id", "score", "text"]
            records = [dict(zip(columns, hit)) for hit in hits]
            print(json.dumps(records, indent=2, default=str))
        else:
            for path, title, chunk_id, score, text in hits:
                # Show at most 160 characters of the chunk, flattened to one line.
                preview = text[:160].replace("\n", " ")
                print(f"[{score:.3f}] {path}#{chunk_id} — {title}\n {preview}")
        return 0

    parser.print_help()
    return 2
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
if __name__ == "__main__":
|
|
129
|
+
sys.exit(main())
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""okf — Open Knowledge Format ingestion (Python binding).
|
|
2
|
+
|
|
3
|
+
Mirrors the R reference binding (r/okf/R/okf.R) and writes a byte-compatible
|
|
4
|
+
DuckDB catalog against the same schema (schema/catalog.sql), so a bundle
|
|
5
|
+
ingested by either language yields the same catalog. Implements OKF v0.1
|
|
6
|
+
permissive consumption: never rejects a bundle for recommended-field issues.
|
|
7
|
+
|
|
8
|
+
Public API:
|
|
9
|
+
read_bundle(root) -> Bundle (concepts + raw links)
|
|
10
|
+
validate(bundle) -> list[Finding]
|
|
11
|
+
links(bundle) -> list[Link]
|
|
12
|
+
ingest(root, db_path) -> (duckdb.Connection, summary dict)
|
|
13
|
+
search(con, term) -> rows
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
import os, re, json, hashlib, datetime
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import Any, Optional
|
|
19
|
+
import yaml
|
|
20
|
+
import duckdb
|
|
21
|
+
|
|
22
|
+
RESERVED = {"index.md", "log.md"}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class _OKFLoader(yaml.SafeLoader):
    """SafeLoader variant that does not resolve plain scalars as timestamps.

    ISO timestamps therefore stay plain strings instead of being coerced to
    datetime, which keeps `timestamp` verbatim (matching the R binding) and
    keeps the frontmatter JSON-serializable.
    """

    # Copy SafeLoader's implicit resolver table, dropping every entry that
    # resolves to the YAML timestamp tag.
    yaml_implicit_resolvers = {
        key: [(tag, pattern) for tag, pattern in resolvers
              if tag != "tag:yaml.org,2002:timestamp"]
        for key, resolvers in yaml.SafeLoader.yaml_implicit_resolvers.items()
    }
|
|
36
|
+
_ISO = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")
|
|
37
|
+
_LINK = re.compile(r"\]\(\s*([^)\s]+)")
|
|
38
|
+
_SCHEME = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*:")
|
|
39
|
+
|
|
40
|
+
# Mirror of schema/catalog.sql (that file is canonical; keep in sync).
|
|
41
|
+
SCHEMA = """
|
|
42
|
+
CREATE TABLE IF NOT EXISTS okf_bundle (bundle_id TEXT PRIMARY KEY, root TEXT,
|
|
43
|
+
okf_version TEXT, source_kind TEXT, ingested_at TEXT, n_concepts INTEGER,
|
|
44
|
+
n_conformant INTEGER, conformant BOOLEAN);
|
|
45
|
+
CREATE TABLE IF NOT EXISTS okf_concept (bundle_id TEXT, path TEXT, reserved BOOLEAN,
|
|
46
|
+
type TEXT, title TEXT, description TEXT, resource TEXT, tags TEXT, timestamp TEXT,
|
|
47
|
+
body TEXT, frontmatter TEXT, parse_error TEXT, content_hash TEXT,
|
|
48
|
+
PRIMARY KEY (bundle_id, path));
|
|
49
|
+
CREATE TABLE IF NOT EXISTS okf_link (bundle_id TEXT, src_path TEXT, dst_raw TEXT,
|
|
50
|
+
dst_path TEXT, resolved BOOLEAN);
|
|
51
|
+
CREATE TABLE IF NOT EXISTS okf_validation (bundle_id TEXT, path TEXT, severity TEXT,
|
|
52
|
+
rule TEXT, message TEXT);
|
|
53
|
+
CREATE TABLE IF NOT EXISTS okf_chunk (bundle_id TEXT, path TEXT, chunk_id INTEGER,
|
|
54
|
+
text TEXT, embedding FLOAT[]);
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class Concept:
    """One markdown file of a bundle plus the values parsed from it.

    Instances are built by read_bundle; the comments below describe how it
    fills each field.
    """

    path: str                     # bundle-relative path, "/"-separated
    reserved: bool                # True when the file name is in RESERVED
    type: Optional[str]           # frontmatter "type", stringified by _s
    title: Optional[str]          # frontmatter "title", stringified by _s
    description: Optional[str]    # frontmatter "description", stringified by _s
    resource: Optional[str]       # frontmatter "resource", stringified by _s
    tags: Any                     # frontmatter "tags", kept as parsed
    timestamp: Optional[str]      # frontmatter "timestamp", stringified by _s
    body: str                     # body text returned by parse_file
    frontmatter: Optional[dict]   # parsed frontmatter mapping, None if unparseable
    parse_error: Optional[str]    # parse_file error code, None on success
    links_raw: list               # link targets extracted from body
    content_hash: str             # SHA-1 hex digest of the UTF-8 encoded body
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class Bundle:
    """A bundle read from disk: identity, origin and its concepts.

    Instances are built by read_bundle; the comments below describe how it
    fills each field.
    """

    bundle_id: str                 # caller-supplied id, else SHA-1 hex of root
    root: str                      # real path of the bundle root, "/"-separated
    okf_version: Optional[str]     # "okf_version" from the top-level index.md frontmatter
    source_kind: str               # origin label passed by the caller ("dir" by default)
    concepts: list = field(default_factory=list)   # Concept objects, one per .md file
    known: set = field(default_factory=set)        # paths of all concepts, for link resolution
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _s(x):
|
|
86
|
+
if x is None:
|
|
87
|
+
return None
|
|
88
|
+
if isinstance(x, (list, dict)):
|
|
89
|
+
return None
|
|
90
|
+
return str(x)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def parse_file(path: str) -> dict:
    """Split a markdown file into YAML frontmatter and body.

    The frontmatter must open on the first non-blank line with a ``---``
    fence and end at the next ``---`` fence.

    Returns a dict with keys:
        "meta": the frontmatter mapping, or None when it is unusable;
        "body": the text after the closing fence (the whole file when no
            complete frontmatter block exists);
        "err":  None on success, else "no_frontmatter",
            "unclosed_frontmatter" or "yaml_parse_error".
    """
    fence = re.compile(r"^---\s*$")
    with open(path, "r", encoding="utf-8") as fh:
        text = fh.read()
    lines = text.split("\n")

    # The opening fence has to be the first line that is not blank.
    start = next((n for n, line in enumerate(lines) if line.strip() != ""), None)
    if start is None or not fence.match(lines[start]):
        return {"meta": None, "body": text, "err": "no_frontmatter"}

    end = next((n for n in range(start + 1, len(lines)) if fence.match(lines[n])), None)
    if end is None:
        return {"meta": None, "body": text, "err": "unclosed_frontmatter"}

    front = "\n".join(lines[start + 1:end])
    body = "\n".join(lines[end + 1:])
    try:
        meta = yaml.load(front, Loader=_OKFLoader)
    except Exception:
        # Any loader failure is reported as a parse error, never raised.
        meta = None
    if not isinstance(meta, dict):
        # Covers empty frontmatter (None) and non-mapping YAML documents.
        return {"meta": None, "body": body, "err": "yaml_parse_error"}
    return {"meta": meta, "body": body, "err": None}
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def extract_links(body: str) -> list:
    """Return every markdown link target found in *body*, in order of appearance.

    A target is the first run of non-space, non-")" characters after "](".
    """
    return [match.group(1) for match in _LINK.finditer(body)]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _norm(p: str) -> str:
|
|
124
|
+
out = []
|
|
125
|
+
for s in p.replace("\\", "/").split("/"):
|
|
126
|
+
if s in ("", "."):
|
|
127
|
+
continue
|
|
128
|
+
if s == "..":
|
|
129
|
+
if out:
|
|
130
|
+
out.pop()
|
|
131
|
+
continue
|
|
132
|
+
out.append(s)
|
|
133
|
+
return "/".join(out)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _is_external(raw: str) -> bool:
    """Tell whether a link target, ignoring its "#fragment", starts with a URI scheme."""
    target, _, _fragment = raw.partition("#")
    return _SCHEME.match(target) is not None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def resolve_link(raw: str, src_rel: str, known: set) -> Optional[str]:
    """Resolve an in-bundle link target to a known concept path.

    raw: link target as written in the markdown, optionally with "#fragment".
    src_rel: bundle-relative, "/"-separated path of the file holding the link.
    known: set of bundle-relative paths that exist in the bundle.

    Targets starting with "/" are taken relative to the bundle root; all
    others are relative to the directory of *src_rel*. A fragment-only
    target ("#heading") refers to the source document itself.

    Returns the normalized path when it is in *known*, otherwise None.
    """
    if raw.startswith("#"):
        # Same-document anchor: without this, the empty path part resolved to
        # the source file's directory and the link was reported as broken.
        return src_rel if src_rel in known else None

    target = raw.split("#", 1)[0]
    if target.startswith("/"):
        cand = target[1:]
    else:
        base = os.path.dirname(src_rel)
        cand = target if base == "" else f"{base}/{target}"
    cand = _norm(cand)
    return cand if cand in known else None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def read_bundle(root: str, bundle_id: Optional[str] = None, source_kind: str = "dir") -> Bundle:
    """Read every ``.md`` file under *root* into a Bundle.

    root: directory to walk recursively; it is resolved with realpath.
    bundle_id: identifier to record; defaults to the SHA-1 hex digest of
        the resolved root path.
    source_kind: label stored on the bundle unchanged.

    Files are processed in sorted path order. The bundle's okf_version
    comes from the frontmatter of the top-level ``index.md``, if present.
    """
    root = os.path.realpath(root).replace("\\", "/")
    md_files = sorted(
        os.path.join(dirpath, name)
        for dirpath, _dirnames, names in os.walk(root)
        for name in names
        if name.endswith(".md")
    )

    concepts = []
    for file_path in md_files:
        parsed = parse_file(file_path)
        # Field lookups fall back to an empty mapping when parsing failed.
        meta = parsed["meta"] or {}
        body = parsed["body"]
        concepts.append(Concept(
            path=os.path.relpath(file_path, root).replace("\\", "/"),
            reserved=os.path.basename(file_path) in RESERVED,
            type=_s(meta.get("type")),
            title=_s(meta.get("title")),
            description=_s(meta.get("description")),
            resource=_s(meta.get("resource")),
            tags=meta.get("tags"),
            timestamp=_s(meta.get("timestamp")),
            body=body,
            frontmatter=parsed["meta"],
            parse_error=parsed["err"],
            links_raw=extract_links(body),
            content_hash=hashlib.sha1(body.encode("utf-8")).hexdigest(),
        ))

    known = {concept.path for concept in concepts}
    index = next((concept for concept in concepts if concept.path == "index.md"), None)
    okf_version = None
    if index is not None:
        okf_version = _s((index.frontmatter or {}).get("okf_version"))
    if bundle_id is None:
        bundle_id = hashlib.sha1(root.encode("utf-8")).hexdigest()
    return Bundle(bundle_id, root, okf_version, source_kind, concepts, known)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def links(b: Bundle) -> list:
    """List every non-external link in the bundle with its resolution result.

    Returns one dict per link with keys "src_path", "dst_raw", "dst_path"
    (None when unresolved) and "resolved".
    """
    rows = []
    for concept in b.concepts:
        internal = (raw for raw in concept.links_raw if not _is_external(raw))
        for raw in internal:
            target = resolve_link(raw, concept.path, b.known)
            rows.append({
                "src_path": concept.path,
                "dst_raw": raw,
                "dst_path": target,
                "resolved": target is not None,
            })
    return rows
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def validate(b: Bundle) -> list:
    """Check a bundle and return its findings.

    Each finding is a dict with keys "path", "severity" ("error" or
    "warn"), "rule" and "message". Reserved files are skipped by the
    per-concept checks. Unparseable frontmatter and a missing type are
    errors; missing title/description/timestamp, a timestamp that does not
    match the expected ISO-8601 form, and unresolved links are warnings.
    """
    findings = []

    def report(path, severity, rule, message):
        findings.append({"path": path, "severity": severity, "rule": rule, "message": message})

    for concept in (c for c in b.concepts if not c.reserved):
        if concept.parse_error is not None:
            # Without frontmatter none of the field checks below can apply.
            report(concept.path, "error", "frontmatter_unparseable",
                   f"no parseable frontmatter ({concept.parse_error})")
            continue
        if not concept.type:
            report(concept.path, "error", "missing_type", "frontmatter has no non-empty type")
        if concept.title is None:
            report(concept.path, "warn", "missing_title", "recommended field title absent")
        if concept.description is None:
            report(concept.path, "warn", "missing_description", "recommended field description absent")
        stamp = concept.timestamp
        if stamp is None:
            report(concept.path, "warn", "missing_timestamp", "recommended field timestamp absent")
        elif not _ISO.match(stamp):
            report(concept.path, "warn", "timestamp_not_iso8601", f"timestamp not ISO-8601: {stamp}")

    broken = [link for link in links(b) if not link["resolved"]]
    for link in broken:
        report(link["src_path"], "warn", "broken_link", f"unresolved link: {link['dst_raw']}")
    return findings
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def ingest(root, db_path: str = ":memory:", ingested_at: Optional[str] = None,
           bundle_id: Optional[str] = None, source_kind: str = "dir"):
    """Validate a bundle and load it into a DuckDB catalog.

    root: bundle directory, or an already-read Bundle (then *bundle_id* and
        *source_kind* are ignored).
    db_path: DuckDB database path; ":memory:" for an in-memory catalog.
    ingested_at: timestamp string to record; defaults to the current UTC
        time formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
    bundle_id, source_kind: forwarded to read_bundle.

    Returns ``(con, summary)``: the open connection, which the caller must
    close, and a dict of counts (n_files, n_concepts, n_conformant,
    conformant, errors, warnings, links_total, links_broken).

    All rows are written in one transaction. If anything fails, including
    the primary-key violation raised when *bundle_id* is already in the
    catalog, the transaction is rolled back, the connection is closed and
    the exception propagates.
    """
    b = root if isinstance(root, Bundle) else read_bundle(root, bundle_id, source_kind)
    val = validate(b)
    lk = links(b)
    if ingested_at is None:
        ingested_at = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    err_paths = {f["path"] for f in val if f["severity"] == "error"}
    non_reserved = [c for c in b.concepts if not c.reserved]
    n_conf = sum(1 for c in non_reserved if c.path not in err_paths)
    conformant = len(err_paths) == 0

    con = duckdb.connect(db_path)
    try:
        for stmt in (s.strip() for s in SCHEMA.split(";") if s.strip()):
            con.execute(stmt)

        # One transaction: a failure part-way through must not leave a
        # half-written bundle behind in the catalog.
        con.begin()
        try:
            con.execute("INSERT INTO okf_bundle VALUES (?,?,?,?,?,?,?,?)",
                        [b.bundle_id, b.root, b.okf_version, b.source_kind, ingested_at,
                         len(non_reserved), n_conf, conformant])
            for c in b.concepts:
                con.execute("INSERT INTO okf_concept VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                            [b.bundle_id, c.path, c.reserved, c.type, c.title, c.description,
                             c.resource, None if c.tags is None else json.dumps(c.tags),
                             c.timestamp, c.body, json.dumps(c.frontmatter or {}),
                             c.parse_error, c.content_hash])
            for lk_ in lk:
                con.execute("INSERT INTO okf_link VALUES (?,?,?,?,?)",
                            [b.bundle_id, lk_["src_path"], lk_["dst_raw"], lk_["dst_path"],
                             lk_["resolved"]])
            for f in val:
                con.execute("INSERT INTO okf_validation VALUES (?,?,?,?,?)",
                            [b.bundle_id, f["path"], f["severity"], f["rule"], f["message"]])
        except Exception:
            con.rollback()
            raise
        con.commit()
    except Exception:
        # The caller never receives the handle on failure, so release it here.
        con.close()
        raise

    summary = {
        "n_files": len(b.concepts), "n_concepts": len(non_reserved), "n_conformant": n_conf,
        "conformant": conformant,
        "errors": sum(1 for f in val if f["severity"] == "error"),
        "warnings": sum(1 for f in val if f["severity"] == "warn"),
        "links_total": len(lk), "links_broken": sum(1 for x in lk if not x["resolved"]),
    }
    return con, summary
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def search(con, term: str):
    """Return (path, type, title) rows of concepts whose body contains *term*.

    Matching is a case-insensitive substring test (ILIKE with the term
    wrapped in "%"); rows are ordered by path.
    """
    pattern = f"%{term}%"
    statement = "SELECT path, type, title FROM okf_concept WHERE body ILIKE ? ORDER BY path"
    return con.execute(statement, [pattern]).fetchall()
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""okf RAG layer (Python) — mirrors the R binding's embeddings/RAG functions.
|
|
2
|
+
|
|
3
|
+
Pluggable embedder (default: local Ollama nomic-embed-text); brute-force cosine
|
|
4
|
+
search via DuckDB's native list_cosine_similarity. An embedder is a callable
|
|
5
|
+
texts:list[str] -> list[list[float]].
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
import os, re, json, urllib.request
|
|
9
|
+
from typing import Callable, Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def chunk_body(body: str, target_chars: int = 600) -> list:
    """Group the paragraphs of *body* into chunks of about *target_chars*.

    Paragraphs are separated by blank lines and stripped; consecutive
    paragraphs are joined with a blank line while the result stays within
    *target_chars*. A single paragraph longer than the target is kept whole.
    A None or empty body gives an empty list.
    """
    pieces = (piece.strip() for piece in re.split(r"\n[ \t]*\n", body or ""))
    paragraphs = [piece for piece in pieces if piece]

    chunks = []
    pending = ""
    for para in paragraphs:
        # +2 accounts for the "\n\n" that would join the two parts.
        if pending and len(pending) + len(para) + 2 > target_chars:
            chunks.append(pending)
            pending = para
        elif pending:
            pending = pending + "\n\n" + para
        else:
            pending = para
    if pending:
        chunks.append(pending)
    return chunks
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def ollama_embedder(model: str = "nomic-embed-text",
                    url: Optional[str] = None,
                    timeout: float = 120) -> Callable:
    """Build an embedder backed by an Ollama server.

    model: model name sent with every request.
    url: server base URL; defaults to the OLLAMA_URL environment variable,
        else "http://localhost:11434". A trailing "/" is ignored.
    timeout: per-request timeout in seconds.

    Returns a callable ``texts -> list of embeddings`` that POSTs one JSON
    request per text to ``<url>/api/embeddings`` and collects the
    "embedding" field of each response. Network and HTTP errors raised by
    urllib propagate to the caller.
    """
    base = url or os.environ.get("OLLAMA_URL", "http://localhost:11434")
    # Strip trailing slashes so "http://host:11434/" does not produce
    # "//api/embeddings".
    endpoint = f"{base.rstrip('/')}/api/embeddings"

    def embed(texts):
        out = []
        for t in texts:
            req = urllib.request.Request(
                endpoint,
                data=json.dumps({"model": model, "prompt": t}).encode("utf-8"),
                headers={"Content-Type": "application/json"})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                out.append(json.loads(r.read())["embedding"])
        return out

    return embed
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _vec_lit(v) -> str:
|
|
45
|
+
return "[" + ",".join(f"{float(z):.8g}" for z in v) + "]::FLOAT[]"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def embed(con, embedder: Optional[Callable] = None, target_chars: int = 600) -> int:
    """Chunk and embed every non-reserved concept, replacing okf_chunk.

    con: writable DuckDB connection to a catalog containing okf_concept.
    embedder: callable ``list[str] -> list of vectors``; defaults to
        ollama_embedder().
    target_chars: chunk size passed to chunk_body.

    Returns the number of chunks written. Chunk ids start at 1 per concept.

    Raises ValueError when the embedder returns a different number of
    vectors than it was given texts. All embeddings are computed before
    okf_chunk is modified, so an embedder failure leaves the previous
    contents of the table intact.
    """
    embedder = embedder or ollama_embedder()
    con.execute("CREATE TABLE IF NOT EXISTS okf_chunk (bundle_id TEXT, path TEXT, "
                "chunk_id INTEGER, text TEXT, embedding FLOAT[])")
    rows = con.execute(
        "SELECT bundle_id, path, body FROM okf_concept WHERE reserved = FALSE ORDER BY path"
    ).fetchall()

    # Phase 1: compute everything without touching the table.
    pending = []
    for bundle_id, path, body in rows:
        chs = chunk_body(body, target_chars)
        if not chs:
            continue
        embs = list(embedder(chs))
        if len(embs) != len(chs):
            # zip() would silently drop the chunks that got no vector.
            raise ValueError(
                f"embedder returned {len(embs)} vectors for {len(chs)} chunks of {path}")
        for k, (text, vec) in enumerate(zip(chs, embs), start=1):
            pending.append((bundle_id, path, k, text, vec))

    # Phase 2: replace the table contents.
    con.execute("DELETE FROM okf_chunk")
    for bundle_id, path, k, text, vec in pending:
        con.execute(
            f"INSERT INTO okf_chunk VALUES (?,?,?,?, {_vec_lit(vec)})",
            [bundle_id, path, k, text])
    return len(pending)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def rag(con, query: str, embedder: Optional[Callable] = None, k: int = 5):
    """Return the *k* chunks most similar to *query*.

    The query is embedded with *embedder* (default: ollama_embedder()) and
    compared against every stored chunk embedding with DuckDB's
    list_cosine_similarity.

    Returns rows of (path, title, chunk_id, score, text), best score first.
    """
    embedder = embedder or ollama_embedder()
    query_vec = embedder([query])[0]
    # The vector literal holds only formatted floats and the limit is forced
    # to int, so nothing from the query text is spliced into the SQL.
    vector_sql = _vec_lit(query_vec)
    limit = int(k)
    statement = f"""SELECT ch.path, c.title, ch.chunk_id,
        list_cosine_similarity(ch.embedding, {vector_sql}) AS score, ch.text
        FROM okf_chunk ch JOIN okf_concept c USING (bundle_id, path)
        WHERE ch.embedding IS NOT NULL ORDER BY score DESC LIMIT {limit}"""
    return con.execute(statement).fetchall()
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: okf-ingest
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Unified ingestion tool for Open Knowledge Format (OKF) bundles — validate, load into a portable DuckDB catalog, and semantically search.
|
|
5
|
+
Author-email: Travis Jakel <travis.s.jakel@gmail.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/travisjakel/okf-ingest
|
|
8
|
+
Project-URL: Repository, https://github.com/travisjakel/okf-ingest
|
|
9
|
+
Project-URL: OKF Specification, https://github.com/GoogleCloudPlatform/knowledge-catalog
|
|
10
|
+
Keywords: okf,open-knowledge-format,knowledge,rag,duckdb,agents
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: pyyaml>=6
|
|
14
|
+
Requires-Dist: duckdb>=1.0
|
|
15
|
+
|
|
16
|
+
# okf (Python binding)
|
|
17
|
+
|
|
18
|
+
Python binding of **okf-ingest** — a unified ingestion tool for
|
|
19
|
+
[Open Knowledge Format](https://github.com/GoogleCloudPlatform/knowledge-catalog)
|
|
20
|
+
(OKF) bundles. Validate a bundle, load it into a portable DuckDB catalog, and
|
|
21
|
+
semantically search it.
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install okf-ingest
|
|
25
|
+
okf validate ./bundle
|
|
26
|
+
okf ingest ./bundle --db catalog.duckdb
|
|
27
|
+
okf embed catalog.duckdb # uses local Ollama nomic-embed-text by default
|
|
28
|
+
okf rag catalog.duckdb --query "how is revenue computed?" -k 5
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import okf
|
|
33
|
+
con, summary = okf.ingest("./bundle", db_path="catalog.duckdb")
|
|
34
|
+
okf.embed(con) # pluggable embedder
|
|
35
|
+
okf.rag_search(con, "revenue", k=5)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
The catalog format is shared with the R binding (`okf` on CRAN-style install),
|
|
39
|
+
so you can ingest/embed in one language and query from the other. See the
|
|
40
|
+
[project README](https://github.com/travisjakel/okf-ingest) and `docs/` for the
|
|
41
|
+
full spec-conformance notes and architecture.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
okf/__init__.py
|
|
4
|
+
okf/__main__.py
|
|
5
|
+
okf/cli.py
|
|
6
|
+
okf/okf.py
|
|
7
|
+
okf/rag.py
|
|
8
|
+
okf_ingest.egg-info/PKG-INFO
|
|
9
|
+
okf_ingest.egg-info/SOURCES.txt
|
|
10
|
+
okf_ingest.egg-info/dependency_links.txt
|
|
11
|
+
okf_ingest.egg-info/entry_points.txt
|
|
12
|
+
okf_ingest.egg-info/requires.txt
|
|
13
|
+
okf_ingest.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
okf
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "okf-ingest"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Unified ingestion tool for Open Knowledge Format (OKF) bundles — validate, load into a portable DuckDB catalog, and semantically search."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = "Apache-2.0"
|
|
12
|
+
authors = [{ name = "Travis Jakel", email = "travis.s.jakel@gmail.com" }]
|
|
13
|
+
keywords = ["okf", "open-knowledge-format", "knowledge", "rag", "duckdb", "agents"]
|
|
14
|
+
dependencies = ["pyyaml>=6", "duckdb>=1.0"]
|
|
15
|
+
|
|
16
|
+
[project.urls]
|
|
17
|
+
Homepage = "https://github.com/travisjakel/okf-ingest"
|
|
18
|
+
Repository = "https://github.com/travisjakel/okf-ingest"
|
|
19
|
+
"OKF Specification" = "https://github.com/GoogleCloudPlatform/knowledge-catalog"
|
|
20
|
+
|
|
21
|
+
[project.scripts]
|
|
22
|
+
okf = "okf.cli:main"
|
|
23
|
+
|
|
24
|
+
[tool.setuptools]
|
|
25
|
+
packages = ["okf"]
|