rdf-engine 182__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ # Note: This is restrictive but it might be possible
2
+ # to compartmentalize some specific ignores in (subfolders/workdirs)
3
+ # by using a .gitignore there
4
+
5
+
6
+ # First, ignore everything
7
+ *
8
+ # Now, whitelist anything that's a directory
9
+ !*/
10
+ # And all the file types you're interested in.
11
+ # don't ignore gitignore, hah
12
+ !.gitignore
13
+ # scripts and progs
14
+ !*.py
15
+ # doc
16
+ !*.md
17
+ # config
18
+ !*.ini
19
+ !*.toml
20
+ !*.yaml
21
+ !*.yml
22
+ !*.lock
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: rdf-engine
3
+ Version: 182
4
+ Summary: A rules engine for RDF data
5
+ Author-email: Majid alDosari <majid.aldosari@pnnl.gov>, Majid alDosari <majidaldosari-github@yahoo.com>
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: pyoxigraph
8
+ Requires-Dist: rdflib
9
+ Description-Content-Type: text/markdown
10
+
11
+ ![PyPI - Status](https://img.shields.io/pypi/v/rdf-engine)
12
+
13
+ # RDF-Engine
14
+
15
+ ## Why?
16
+
17
+ Motivation: This was developed as part of [BIM2RDF](https://github.com/PNNL/BIM2RDF)
18
+ where the conversion from BIM to RDF is framed as 'mapping rules'.
19
+
20
+ ## How?
21
+
22
+ Rules are processes that generate triples.
23
+ They are simply applied until no _new_ triples are produced.
24
+ [Oxigraph](https://github.com/oxigraph/oxigraph) is used to store data.
25
+
26
+ ## Features
27
+
28
+ * Handling of anonymous/blank nodes: they can be deanonymized
29
+ * Oxigraph can handle RDF-star data and querying
30
+
31
+
32
+ ## Development Philosophy
33
+ * **KISS**: It should only address executing rules.
34
+ Therefore, the code is expected to be feature complete (without need for adding more 'features').
35
+ * **Minimal dependencies**: follows from above.
@@ -0,0 +1,25 @@
1
+ ![PyPI - Status](https://img.shields.io/pypi/v/rdf-engine)
2
+
3
+ # RDF-Engine
4
+
5
+ ## Why?
6
+
7
+ Motivation: This was developed as part of [BIM2RDF](https://github.com/PNNL/BIM2RDF)
8
+ where the conversion from BIM to RDF is framed as 'mapping rules'.
9
+
10
+ ## How?
11
+
12
+ Rules are processes that generate triples.
13
+ They are simply applied until no _new_ triples are produced.
14
+ [Oxigraph](https://github.com/oxigraph/oxigraph) is used to store data.
15
+
16
+ ## Features
17
+
18
+ * Handling of anonymous/blank nodes: they can be deanonymized
19
+ * Oxigraph can handle RDF-star data and querying
20
+
21
+
22
+ ## Development Philosophy
23
+ * **KISS**: It should only address executing rules.
24
+ Therefore, the code is expected to be feature complete (without need for adding more 'features').
25
+ * **Minimal dependencies**: follows from above.
@@ -0,0 +1,36 @@
1
+ [project]
2
+ name = "rdf-engine"
3
+ dynamic = ['version' ]
4
+ description = "A rules engine for RDF data"
5
+ authors = [
6
+ {"name" = "Majid alDosari", email="majid.aldosari@pnnl.gov"},
7
+ {"name" = "Majid alDosari", email="majidaldosari-github@yahoo.com"},
8
+ ]
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "pyoxigraph",
13
+ "rdflib", # just for 'deanon', otherwise dont want this
14
+ ]
15
+ [dependency-groups]
16
+ dev = [
17
+ 'ipython', 'ipdb',
18
+ 'pytest',
19
+ 'pre-commit',
20
+ 'fire'
21
+ ]
22
+
23
+ [project.scripts]
24
+ # todo: perhaps a 'program file'
25
+
26
+ [build-system]
27
+ # > uv build
28
+ # > uvx hatchling version major
29
+ # uv backend 'std'
30
+ # https://github.com/astral-sh/uv/issues/3957
31
+ requires = ["hatchling"]
32
+ build-backend = "hatchling.build"
33
+ [tool.hatch.version]
34
+ path = "src/rdf_engine/__init__.py"
35
+ [tool.hatch.build.targets.sdist]
36
+ include = ['src/**/*.py']
@@ -0,0 +1,3 @@
1
+ __version__ = "182"
2
+
3
+ from .engine import Engine, logger
@@ -0,0 +1,108 @@
1
+ """
2
+ canonicalization
3
+ """
4
+
5
class _quads:
    """
    Callable that canonicalizes an iterable of quads.

    Blank-node replacement is exposed as the `deanon` attribute.
    """
    from pyoxigraph import Quad
    from typing import Iterable

    def __call__(self, quads: Iterable[Quad]) -> Iterable[Quad]:
        """
        canonicalization of sets of quads

        Quads are grouped with `index` (by outer predicate and graph),
        each group is canonicalized as a set of triples with `triples`,
        and the result is reassembled into quads.

        Raises ValueError when a group's graph name is a blank node.
        """
        from pyoxigraph import BlankNode
        from .data import index
        for key, group in index(quads).items():
            if isinstance(key.graph, BlankNode):
                raise ValueError(f'not handling graph blank/anon node of graph {key.graph}')
            if key.outerpredicate:
                # Nested (quoted) triples on both sides of the outer predicate.
                # Subjects and objects are canonicalized separately to keep computations down.
                # assumption: the nested triple terms are not related anonymously
                subjects = triples(t.subject for t in group)
                assert(len(group))
                objects = triples(t.object for t in group)
                assert(len(subjects) == len(objects))
                # NOTE(review): this emits every subject/object combination,
                # so a group holding more than one quad yields pairings
                # that may not have been in the input -- confirm intended.
                for s in subjects:
                    for o in objects:
                        yield self.Quad(s, key.outerpredicate, o, key.graph)
            else:
                # Plain triples: canonicalize the group and put the graph back.
                for t in triples(group):
                    yield self.Quad(*t, key.graph)

    class _deanon:
        """Callable that replaces blank nodes with named nodes."""
        from pyoxigraph import Triple
        from typing import Iterable
        from pyoxigraph import Quad
        from pyoxigraph import BlankNode, NamedNode

        def __call__(slf,
                quads: Iterable[Quad], *,
                uri = "urn:anon:hash:") -> Iterable[Quad]:
            """takes blank node value as an identifier for a uri"""
            return map(lambda q: slf.quad(q, uri), quads)

        @classmethod
        def quad(cls, q: Quad, uri):
            """Return `q` with blank nodes replaced, including one level of nested triples."""
            def lift(term):
                # Only a nested triple needs rebuilding; other terms pass through.
                if isinstance(term, cls.Triple):
                    return cls.Triple(*(cls.f(n, uri) for n in term))
                return term
            q = cls.Quad(lift(q.subject), q.predicate, lift(q.object), q.graph_name)
            return cls.Quad(*(cls.f(n, uri) for n in q))

        @classmethod
        def f(cls, n, uri: str):
            """Map a blank node to `NamedNode(uri + value)`; return any other term unchanged."""
            return cls.NamedNode(uri+n.value) if isinstance(n, cls.BlankNode) else n
    deanon = _deanon()
63
+ quads = _quads()
64
+
65
+
66
+ from pyoxigraph import BlankNode, Quad, Triple
67
+ from typing import Iterable
68
def hasanon(d: Iterable[Quad| Triple]):
    """Return True if any statement in `d` has a blank node as its subject or object."""
    return any(
        isinstance(term, BlankNode)
        for stmt in d
        for term in (stmt.subject, stmt.object))
75
+
76
+
77
def triples(ts):
    """
    Canonicalize a set of (non-nested) triples.

    Returns a frozenset of triples. When no subject or object is a blank
    node the input set is returned as is; otherwise the triples are
    round-tripped through rdflib's `to_canonical_graph`.
    """
    from pyoxigraph import Triple
    tset = ts if isinstance(ts, frozenset) else frozenset(ts)
    assert(isinstance(tset, frozenset))
    for t in tset:
        assert(not isinstance(t.subject, Triple))
        assert(not isinstance(t.object, Triple))
    # optimization: nothing to relabel without blank nodes
    if not hasanon(tset):
        return tset

    from . import conversions as c
    from rdflib import Graph
    from rdflib.compare import to_canonical_graph
    graph = Graph()
    for t in c.oxigraph.rdflib(tset):
        graph.add(t)
    canonical = to_canonical_graph(graph)
    return frozenset(c.rdflib.oxigraph(canonical))
100
def _ogtriples(triples):
    """Yield the triples after canonicalizing them with pyoxigraph's `Dataset.canonicalize`."""
    # algo seems to gets stuck (slow?)
    # wait for update TODO
    from pyoxigraph import Dataset, CanonicalizationAlgorithm, Quad
    dataset = Dataset(Quad(*t) for t in triples)
    dataset.canonicalize(CanonicalizationAlgorithm.UNSTABLE)  # ?? unstable??
    for quad in dataset:
        yield quad.triple
@@ -0,0 +1,108 @@
1
+ """
2
+ just to use rdflib to canonicalize
3
+ """
4
+ #use oxrdflib?
5
+ #also deal with strttl
6
+
7
class terms:
    """
    Term classes of rdflib and oxigraph under common short names,
    plus term-level converters between the two.

    `rdflib` and `oxigraph` are namespaces with the attributes
    BN (blank node), Lit (literal), NN (named node) and DG (default graph).
    """
    def _():
        from rdflib.graph import DATASET_DEFAULT_GRAPH_ID as DG
        from rdflib.term import BNode as BN, Literal as Lit, URIRef as NN
        return locals()
    rdflib = _()
    def _():
        from pyoxigraph import BlankNode as BN, Literal as Lit, NamedNode as NN
        from pyoxigraph import DefaultGraph as DG; DG = DG()
        return locals()
    oxigraph = _()
    from types import SimpleNamespace as NS
    rdflib = NS(**rdflib)
    oxigraph = NS(**oxigraph)
    # Both namespaces must expose the same short names.
    # (This used to compare rdflib with itself, so it could never fail.)
    assert(frozenset(rdflib.__dict__.keys()) == frozenset(oxigraph.__dict__.keys()))
    del NS

    class og2rl:
        """Convert one oxigraph term to the equivalent rdflib term."""
        def __call__(slf, n):
            rl = terms.rdflib
            og = terms.oxigraph
            if isinstance(n, og.BN):
                return rl.BN(n.value)
            elif isinstance(n, og.Lit):
                return rl.Lit(n.value,
                    # TypeError: A Literal can only have one of lang or datatype,
                    # per http://www.w3.org/TR/rdf-concepts/#section-Graph-Literal
                    datatype=None if n.language else slf(n.datatype),
                    lang=n.language)
            elif n == og.DG:
                return rl.DG
            else:
                assert(isinstance(n, og.NN))
                return rl.NN(n.value)
    og2rl = og2rl()

    class rl2og:
        """Convert one rdflib term to the equivalent oxigraph term."""
        def __call__(s, n):
            rl = terms.rdflib
            og = terms.oxigraph
            assert(isinstance(n, str))  # makes the following work for bn and nn
            if isinstance(n, rl.BN):
                return og.BN(n)
            elif isinstance(n, rl.Lit):
                return og.Lit(n,
                    datatype=s(n.datatype) if n.datatype else None,
                    language=n.language)
            elif n == rl.DG:
                return og.DG
            else:
                assert(isinstance(n, rl.NN))
                return og.NN(n)
    rl2og = rl2og()
65
+ terms = terms()
66
+
67
+
68
class types:
    """Union types over each library's term classes (the default-graph value is excluded)."""
    from typing import Union
    rdflib = Union[tuple([
        t for t in vars(terms.rdflib).values()
        if t is not terms.rdflib.DG])]
    oxigraph = Union[tuple([
        t for t in vars(terms.oxigraph).values()
        if t is not terms.oxigraph.DG])]
72
+
73
class rdflib:
    """Conversions whose input is rdflib data."""
    class oxigraph:
        """Callable converting rdflib statements to oxigraph, for putting them back in oxigraph."""
        from typing import Iterable
        from pyoxigraph import Triple, Quad
        def __call__(slf, d: Iterable[Iterable[types.rdflib]]) -> Iterable[Iterable[types.oxigraph]]:
            """
            Lazily convert each statement in `d` (a sequence of 3 or 4
            rdflib terms) to a pyoxigraph Triple or Quad.
            """
            def convert(q):
                converted = tuple(terms.rl2og(t) for t in q)
                if len(converted) == 3:
                    return slf.Triple(*converted)
                assert(len(converted) == 4)
                return slf.Quad(*converted)
            # The serialization-based code that used to follow this return
            # was unreachable and referenced undefined names; it is removed.
            return map(convert, d)
    oxigraph = oxigraph()
93
+ rdflib = rdflib()
94
+
95
class oxigraph:
    """Conversions whose input is oxigraph data."""
    class rdflib:
        """Callable converting oxigraph statements to tuples of rdflib terms."""
        from typing import Iterable
        def __call__(slf, d: Iterable[Iterable[types.oxigraph]]) -> Iterable[Iterable[types.rdflib]]:
            """Lazily convert each statement in `d` to a tuple of rdflib terms."""
            # The serialization-based code that used to follow this return
            # was unreachable and referenced an undefined name; it is removed.
            return map(lambda q: tuple(terms.og2rl(t) for t in q), d)
    rdflib = rdflib()
108
+ oxigraph = oxigraph()
@@ -0,0 +1,40 @@
1
+ from pyoxigraph import Quad, Triple # putting imports here for max performance
2
class _index:
    """to group quads"""
    from typing import NamedTuple
    class index(NamedTuple):
        """Grouping key: the outer predicate (only for nested-triple quads) and the graph."""
        from pyoxigraph import NamedNode, BlankNode
        outerpredicate: NamedNode | None
        graph:          NamedNode | BlankNode

        from typing import Self
        @classmethod
        def quad(cls, q: Quad) -> Self:
            """
            Build the grouping key of `q`.

            Raises ValueError when the subject is a nested triple
            but the object is not.
            """
            nested_subject = isinstance(q.subject, Triple)
            if nested_subject and not isinstance(q.object, Triple):
                raise ValueError(f'not handling nested subject without nested object of ({q})')
            return cls(
                # the predicate only matters for nested-triple quads
                outerpredicate=q.predicate if nested_subject else None,
                graph=q.graph_name,
            )

    from typing import Iterable
    from pyoxigraph import Quad
    def __call__(slf, d: Iterable[Quad]) -> dict[index, frozenset[Triple]]:
        """Group the triples of the quads in `d` by their `index` key."""
        from collections import defaultdict
        groups = defaultdict(set)
        for q in d:
            groups[slf.index.quad(q)].add(q.triple)
        # freeze each group; keys are unchanged so iterating is safe
        for key in groups:
            groups[key] = frozenset(groups[key])
        return groups
32
+
33
+ index = _index()
34
+
35
+ from .db import Ingestable
36
+ from typing import Iterable
37
+ from pyoxigraph import Quad
38
def quads(i: Ingestable) -> Iterable[Quad]:
    """Yield the quads of `i` by ingesting it into a fresh `Store` and iterating that store."""
    from .db import ingest, Store
    for q in ingest(Store(), i):
        yield q
@@ -0,0 +1,14 @@
1
+ from typing import Iterable
2
+ from pyoxigraph import Quad, Store
3
+ Ingestable = bytes|str|Iterable[Quad]
4
def ingest(s: Store, d: Ingestable, *, flush=True):
    """
    Load `d` into the store `s` and return `s`.

    Text or bytes are bulk-loaded as Turtle; anything else is passed
    to `bulk_extend` as an iterable of quads. The store is flushed
    afterwards unless `flush` is false.
    """
    if not isinstance(d, (str, bytes)):
        # assume iterable[quad]
        s.bulk_extend(d)
    else:
        # assume ttl
        from pyoxigraph import RdfFormat
        s.bulk_load(d, format=RdfFormat.TURTLE)

    if flush:
        s.flush()
    return s
@@ -0,0 +1,108 @@
1
+ import logging # :( i dont do module level imports
2
+ logger = logging.getLogger('engine')
3
+
4
+
5
class Engine:
    """
    Applies rules to a store, cycle after cycle, until a cycle leaves
    the size of the store unchanged or `MAX_NCYCLES` is reached.

    A rule is a callable taking the store and returning data to ingest.
    Rules are used as keys of the log, so they must be hashable when
    logging is enabled.
    """
    from .rules import Rule
    from typing import Iterable
    from pyoxigraph import Store
    from inspect import signature
    from .canon import quads
    def __init__(self,
            rules: Iterable[Rule] = (), *,
            db: Store | None = None,
            MAX_NCYCLES: int=99,
            # safe settings to avoid inf cycling
            # but reduces performance
            canon: bool=True,
            deanon: bool=True, deanon_uri: str=signature(quads.deanon).parameters['uri'].default,
            # typically expecting the engine to be used in a stand-alone program
            # so it helps to have logging.
            log: bool=True, log_print: bool=True,
            ) -> None:
        """
        rules:       the rules to apply, in order, on every cycle.
        db:          store to work on; a new `Store` is created when omitted.
        MAX_NCYCLES: upper bound on the number of cycles.
        canon:       canonicalize rule output before ingesting it.
        deanon:      replace blank nodes in rule output (forces `canon`).
        deanon_uri:  prefix of the names that replace blank nodes.
        log:         record the store size before/after each rule application.
        log_print:   also emit progress through the module `logger`.
        """
        self.rules = list(rules)
        # A `Store()` default in the signature would be created once and
        # shared by every engine, so the default store is created here.
        self.db = db if db is not None else self.Store()
        self.MAX_NCYCLES = MAX_NCYCLES
        self.canon = True if deanon else canon
        self.deanon = deanon
        self.deanon_uri = deanon_uri
        self.i = 0  # number of completed cycles

        # logging
        if log:
            from collections import defaultdict, namedtuple
            from types import SimpleNamespace as NS
            self.logging = NS(
                print = log_print,
                log = defaultdict(list),
                delta = namedtuple('delta', ['before', 'after'] ))

    # TODO: make a method for applying one rule
    def run1(self) -> Store:
        """Run one cycle: apply every rule once and ingest its output. Returns the store."""
        from time import monotonic
        from .db import ingest
        if hasattr(self, 'logging'):
            if self.logging.print:
                line = '-'*10
                logger.info(f"CYCLE {self.i} {line}")

        for r in self.rules:  # TODO: could be parallelized
            # before
            if hasattr(self, 'logging'):
                before = len(self.db)
                if self.logging.print:
                    logger.info(f"{repr(r)}")
                    start_time = monotonic()

            # do
            _ = r(self.db)
            if self.canon:
                from .data import quads as as_quads
                from .canon import quads as canonicalize
                _ = as_quads(_)
                _ = canonicalize(_)
                # `__init__` forces canon on whenever deanon is on
                if self.deanon:
                    _ = canonicalize.deanon(_, uri=self.deanon_uri)
            # so if a rule returns a string,
            # it /could/ go in fast in the case of no processing (canon/deanon)
            ingest(self.db, _, flush=True)
            del _

            # after
            if hasattr(self, 'logging'):
                delta = self.logging.delta(before, len(self.db))
                self.logging.log[r].append(delta)
                if self.logging.print:
                    logger.info(f"# triples before {delta.before }, after {delta.after } => {delta.after-delta.before}.")
                    logger.info(f"took {'{0:.2f}'.format(monotonic()-start_time)} seconds")

        self.i += 1
        self.db.flush()
        self.db.optimize()
        return self.db

    def stop(self) -> bool:
        """
        Run one cycle and return True when it did not change the size of the store.

        With a non-positive `MAX_NCYCLES` no cycle is run and False is returned.
        """
        if self.MAX_NCYCLES <= 0:
            return False
        # could put validations here
        # the left operand is evaluated before run1() executes the cycle
        return len(self.db) == len(self.run1())

    def __iter__(self) -> Iterable[Store]:
        """Yield the store after each cycle that changed it, and once more on convergence."""
        while (not self.stop()):
            if self.i >= self.MAX_NCYCLES:
                if hasattr(self, 'logging'):
                    if self.logging.print:
                        logger.warning('reached max cycles')
                break
            yield self.db
        else:  # for case when nothing needs to happen
            yield self.db

    def run(self) -> Store:
        """Run cycles until the engine stops; return the store."""
        for _ in self: continue
        return self.db
    __call__ = run
108
+
@@ -0,0 +1,9 @@
1
+ from typing import Callable
2
+ from pyoxigraph import Store
3
+ from .db import Ingestable
4
+ Rule = Callable[[Store], Ingestable]
5
+
6
+ # it's possible to come up with an abstraction for a Rule
7
+ # but decided not to (here at least).
8
+
9
+ # an engine run (itself) could be taken as a 'rule'.