PyPI - rdf-engine - Versions diffs - 182__tar.gz - Mend

rdf-engine 182__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

rdf_engine-182/.gitignore +22 -0
rdf_engine-182/PKG-INFO +35 -0
rdf_engine-182/README.md +25 -0
rdf_engine-182/pyproject.toml +36 -0
rdf_engine-182/src/rdf_engine/__init__.py +3 -0
rdf_engine-182/src/rdf_engine/canon.py +108 -0
rdf_engine-182/src/rdf_engine/conversions.py +108 -0
rdf_engine-182/src/rdf_engine/data.py +40 -0
rdf_engine-182/src/rdf_engine/db.py +14 -0
rdf_engine-182/src/rdf_engine/engine.py +108 -0
rdf_engine-182/src/rdf_engine/rules.py +9 -0

rdf_engine-182/.gitignore ADDED Viewed

@@ -0,0 +1,22 @@
+# Note: This is restrictive but it might be possible
+# to compartmentalize some specific ignores in (subfolders/workdirs)
+# by using a .gitignore there
+# First, ignore everything
+*
+# Now, whitelist anything that's a directory
+!*/
+# And all the file types you're interested in.
+# don't ignore gitignore, hah
+!.gitignore
+# scripts and progs
+!*.py
+# doc
+!*.md
+# config
+!*.ini
+!*.toml
+!*.yaml
+!*.yml
+!*.lock

rdf_engine-182/PKG-INFO ADDED Viewed

@@ -0,0 +1,35 @@
+Metadata-Version: 2.4
+Name: rdf-engine
+Version: 182
+Summary: A rules engine for RDF data
+Author-email: Majid alDosari <majid.aldosari@pnnl.gov>, Majid alDosari <majidaldosari-github@yahoo.com>
+Requires-Python: >=3.11
+Requires-Dist: pyoxigraph
+Requires-Dist: rdflib
+Description-Content-Type: text/markdown
+![PyPI - Version](https://img.shields.io/pypi/v/rdf-engine)
+# RDF-Engine
+## Why?
+Motivation: This was developed as part of [BIM2RDF](https://github.com/PNNL/BIM2RDF)
+where the conversion from BIM to RDF is framed as 'mapping rules'.
+## How?
+Rules are processes that generate triples.
+They are simply applied until no _new_ triples are produced.
+[Oxigraph](https://github.com/oxigraph/oxigraph) is used to store data.
+## Features
+* Handling of anonymous/blank nodes: they can be deanonymized
+* Oxigraph can handle RDF-star data and querying
+## Development Philosophy
+* **KISS**: It should only address executing rules.
+Therefore, the code is expected to be feature complete (without need for adding more 'features').
+* **Minimal dependencies**: follows from above.

rdf_engine-182/README.md ADDED Viewed

@@ -0,0 +1,25 @@
+![PyPI - Version](https://img.shields.io/pypi/v/rdf-engine)
+# RDF-Engine
+## Why?
+Motivation: This was developed as part of [BIM2RDF](https://github.com/PNNL/BIM2RDF)
+where the conversion from BIM to RDF is framed as 'mapping rules'.
+## How?
+Rules are processes that generate triples.
+They are simply applied until no _new_ triples are produced.
+[Oxigraph](https://github.com/oxigraph/oxigraph) is used to store data.
+## Features
+* Handling of anonymous/blank nodes: they can be deanonymized
+* Oxigraph can handle RDF-star data and querying
+## Development Philosophy
+* **KISS**: It should only address executing rules.
+Therefore, the code is expected to be feature complete (without need for adding more 'features').
+* **Minimal dependencies**: follows from above.

rdf_engine-182/pyproject.toml ADDED Viewed

@@ -0,0 +1,36 @@
+[project]
+name = "rdf-engine"
+dynamic = ['version' ]
+description = "A rules engine for RDF data"
+authors = [
+    {"name" = "Majid alDosari", email="majid.aldosari@pnnl.gov"},
+    {"name" = "Majid alDosari", email="majidaldosari-github@yahoo.com"},
+ ]
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "pyoxigraph",
+    "rdflib", # just for 'deanon', otherwise dont want this
+    ]
+[dependency-groups]
+dev = [
+    'ipython', 'ipdb',
+    'pytest',
+    'pre-commit',
+    'fire'
+]
+[project.scripts]
+# todo: perhaps a 'program file'
+[build-system]
+# > uv build
+# > uvx hatchling version major
+# uv backend 'std'
+# https://github.com/astral-sh/uv/issues/3957
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.version]
+path = "src/rdf_engine/__init__.py"
+[tool.hatch.build.targets.sdist]
+include = ['src/**/*.py']

rdf_engine-182/src/rdf_engine/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+__version__ = "182"
+from .engine import Engine, logger

rdf_engine-182/src/rdf_engine/canon.py ADDED Viewed

@@ -0,0 +1,108 @@
+"""
+canonicalization
+"""
+class _quads:
+    from pyoxigraph import Quad
+    from typing import Iterable
+    def __call__(self, quads: Iterable[Quad]) -> Iterable[Quad]:
+        """
+        canonicalization of sets of quads
+        """
+        # the set of quads have to be broken into sets of triples
+        # after canonicalization,
+        # each set of triples has to be reassembled as quads
+        from  pyoxigraph import BlankNode
+        from .data import index
+        for i,itriples in index(quads).items():
+            if isinstance(i.graph, BlankNode):
+                raise ValueError(f'not handling graph blank/anon node of graph {i.graph}')
+            if not i.outerpredicate:
+                c = triples(itriples)
+                yield from (self.Quad(*t, i.graph) for t in c)
+            else:
+                assert(i.outerpredicate)
+                # separately to keep computations down.
+                # assumption: the nested triple terms are not related anonymously
+                cs = triples(t.subject  for t in itriples)
+                assert(len(itriples))
+                co = triples(t.object   for t in itriples)
+                assert(len(cs) == len(co))
+                for s in cs:
+                    for o in co:
+                        yield self.Quad(s, i.outerpredicate, o, i.graph)
+    class _deanon:
+        from pyoxigraph import Triple
+        from typing import Iterable
+        from pyoxigraph import Quad
+        def __call__(slf,
+                quads: Iterable[Quad], *,
+                uri = "urn:anon:hash:") -> Iterable[Quad]:
+            """takes blank node value as an identifier for a uri"""
+            _ = map(lambda q: slf.quad(q, uri), quads)
+            return _
+        @classmethod
+        def quad(cls, q: Quad, uri):
+            if isinstance(q.subject, cls.Triple):
+                _ = cls.Triple(*(cls.f(n, uri) for n in q.subject))
+                q = cls.Quad(_, q.predicate, q.object, q.graph_name)
+            if isinstance(q.object, cls.Triple):
+                _ = cls.Triple(*(cls.f(n, uri) for n in q.object))
+                q = cls.Quad(q.subject, q.predicate, _, q.graph_name)
+            return cls.Quad(*(cls.f(n, uri) for n in q))
+        from pyoxigraph import BlankNode, NamedNode
+        @classmethod
+        def f(cls, n, uri: str):
+            if isinstance(n, cls.BlankNode):
+                return cls.NamedNode(uri+n.value)
+            else:
+                return n
+    deanon = _deanon()
+quads = _quads()
+from pyoxigraph import BlankNode, Quad, Triple
+from typing import Iterable
+def hasanon(d: Iterable[Quad| Triple]):
+    for q in d:
+        if isinstance(q.subject, BlankNode):
+            return True
+        if isinstance(q.object, BlankNode):
+            return True
+    return False
+def triples(ts):
+    if not isinstance(ts, frozenset):
+        ts = frozenset(ts)
+    assert(isinstance(ts, frozenset))
+    from pyoxigraph import Triple
+    for t in ts:
+        assert(not isinstance(t.subject,    Triple))
+        assert(not isinstance(t.object,     Triple))
+    # optimization
+    if not hasanon(ts):
+        return ts
+    from . import conversions as c
+    ts = c.oxigraph.rdflib(ts)
+    from rdflib import Graph
+    _ = Graph()
+    for t in ts: ts = _.add(t)
+    from rdflib.compare import to_canonical_graph
+    ts = _; del _
+    ts = to_canonical_graph(ts)
+    ts = c.rdflib.oxigraph(ts)
+    ts = frozenset(ts)
+    return ts
+def _ogtriples(triples):
+    # algo seems to gets stuck (slow?)
+    # wait for update TODO
+    from pyoxigraph import Dataset, CanonicalizationAlgorithm, Quad
+    def _(triples):
+        d = Dataset(Quad(*t) for t in triples)
+        d.canonicalize(CanonicalizationAlgorithm.UNSTABLE) # ?? unstable??
+        for q in d: yield q.triple
+    yield from _(triples)

rdf_engine-182/src/rdf_engine/conversions.py ADDED Viewed

@@ -0,0 +1,108 @@
+"""
+just to use rdflib to canonicalize
+"""
+#use oxrdflib?
+#also deal with strttl
+class terms:
+    def _():
+        from rdflib.graph   import DATASET_DEFAULT_GRAPH_ID                   as DG
+        from rdflib.term    import BNode     as BN, Literal as Lit, URIRef    as NN
+        return locals()
+    rdflib = _()
+    def _():
+        from pyoxigraph     import BlankNode as BN, Literal as Lit, NamedNode as NN
+        from pyoxigraph     import DefaultGraph                               as DG; DG = DG()
+        return locals()
+    oxigraph = _()
+    from types import SimpleNamespace as NS
+    rdflib    = NS(**rdflib)
+    oxigraph  = NS(**oxigraph)
+    assert(frozenset(rdflib.__dict__.keys()) == frozenset(rdflib.__dict__.keys()))
+    del NS
+    class og2rl:
+        def __call__(slf, n):
+            rl = terms.rdflib
+            og = terms.oxigraph
+            if isinstance(n,
+                        og.BN):
+                return  rl.BN(n.value)
+            elif isinstance(n,
+                        og.Lit):
+                return  rl.Lit(n.value,
+                            datatype=None if n.language else slf(n.datatype),
+                            #TypeError: A Literal can only have one of lang or datatype, per http://www.w3.org/TR/rdf-concepts/#section-Graph-Literal
+                            lang=n.language)
+            elif n ==   og.DG:
+                return  rl.DG
+            else:
+                assert(isinstance(n,
+                        og.NN))
+                return  rl.NN(n.value)
+    og2rl = og2rl()
+    class rl2og:
+        def __call__(s, n):
+            rl = terms.rdflib
+            og = terms.oxigraph
+            assert(isinstance(n, str)) # makes the following work for bn and nn
+            if isinstance(n,
+                        rl.BN):
+                return  og.BN(n)
+            elif isinstance(n,
+                        rl.Lit):
+                return  og.Lit(n,
+                            datatype=s(n.datatype) if n.datatype else None,
+                            language=n.language)
+            if n ==     rl.DG:
+                return  og.DG
+            else:
+                assert(isinstance(n,
+                        rl.NN))
+                return  og.NN(n)
+    rl2og = rl2og()
+terms = terms()
+class types:
+    from typing import Union
+    rdflib =   Union[tuple(t for t in terms.rdflib  .__dict__.values() if t is not terms.rdflib  .DG)]
+    oxigraph = Union[tuple(t for t in terms.oxigraph.__dict__.values() if t is not terms.oxigraph.DG)]
+class rdflib:
+    class oxigraph:
+        from typing import Iterable
+        from pyoxigraph import Triple, Quad
+        def __call__(slf, d: Iterable[Iterable[types.oxigraph]]) -> Iterable[Iterable[types.rdflib]]:
+            def _(q):
+                _ = tuple(terms.rl2og(t) for t in q)
+                if len(_) == 3: return slf.Triple(*_)
+                else:
+                    assert(len(_) == 4)
+                    return slf.Quad(*_)
+            _ = map(_, d)
+            return _
+            from pyoxigraph import serialize, RdfFormat
+            _ = serialize(og, format=RdfFormat.N_QUADS)
+            return _
+            r = Dataset()
+            r.parse(data=_, format='application/n-quads')
+            return r
+    oxigraph = oxigraph()
+rdflib = rdflib()
+class oxigraph:
+    class rdflib:
+        from typing import Iterable
+        def __call__(slf, d: Iterable[Iterable[types.rdflib]]) -> Iterable[Iterable[types.oxigraph]]:
+            # for putting it back in og
+            _ = lambda q: tuple(terms.og2rl(t) for t in q)
+            _ = map(_, d)
+            return _
+            _ = rl.serialize(format='application/n-quads')
+            from pyoxigraph import parse, RdfFormat
+            _ = parse(_, format=RdfFormat.N_QUADS)
+            return _
+    rdflib = rdflib()
+oxigraph = oxigraph()

rdf_engine-182/src/rdf_engine/data.py ADDED Viewed

@@ -0,0 +1,40 @@
+from pyoxigraph import Quad, Triple # putting imports here for max performance
+class _index:
+    """to group quads"""
+    from typing import NamedTuple
+    class index(NamedTuple):
+        from pyoxigraph import NamedNode, BlankNode
+        outerpredicate:  NamedNode | None
+        graph:           NamedNode | BlankNode
+        from typing import Self
+        @classmethod
+        def quad(cls, q: Quad) -> Self:
+            if isinstance(q.subject, Triple):
+                if not isinstance(q.object, Triple):
+                    raise ValueError(f'not handling nested subject without nested object of ({q})')
+                op = q.predicate # i care about the predicate
+            else:
+                op = None
+            return cls(
+                outerpredicate = op,
+                graph = q.graph_name
+            )
+    from typing import Iterable
+    from pyoxigraph import Quad
+    def __call__(slf, d: Iterable[Quad]) -> dict[index, frozenset[Triple]]:
+        from collections import defaultdict
+        idx = defaultdict(set)
+        for q in d: idx[slf.index.quad(q)].add(q.triple)
+        for k,v in idx.items(): idx[k] = frozenset(v)
+        return idx
+index = _index()
+from .db import Ingestable
+from typing import Iterable
+from pyoxigraph import Quad
+def quads(i: Ingestable) -> Iterable[Quad]:
+    from .db import ingest, Store
+    yield from ingest(Store(), i)

rdf_engine-182/src/rdf_engine/db.py ADDED Viewed

@@ -0,0 +1,14 @@
+from typing import Iterable
+from pyoxigraph import Quad, Store
+Ingestable = bytes|str|Iterable[Quad]
+def ingest(s: Store, d:  Ingestable, *, flush=True):
+    if isinstance(d, (str,bytes)):
+        # assume ttl
+        from pyoxigraph import RdfFormat
+        s.bulk_load(d, format=RdfFormat.TURTLE)
+    else:
+        # assume iterable[quand]
+        s.bulk_extend(d)
+    if flush: s.flush()
+    return s

rdf_engine-182/src/rdf_engine/engine.py ADDED Viewed

@@ -0,0 +1,108 @@
+import logging # :( i dont do module level imports
+logger = logging.getLogger('engine')
+class Engine:
+    from .rules import Rule
+    from typing import Iterable
+    from pyoxigraph import Store
+    from inspect import signature
+    from .canon import quads
+    def __init__(self,
+        rules: Iterable[Rule] = [], *,
+        db: Store=Store(),
+            MAX_NCYCLES: int=99,
+        # safe settings to avoid inf cycling
+        # but reduces performance
+            canon: bool=True,
+            deanon: bool=True, deanon_uri: str=signature(quads.deanon).parameters['uri'].default,
+        # typically expecting the engine to be used in a stand-alone program
+        # so it helps to have logging.
+            log: bool=True, log_print: bool=True,
+        ) -> None:
+        self.rules = list(rules)
+        self.db = db
+        self.MAX_NCYCLES = MAX_NCYCLES
+        self.canon = True if deanon else canon
+        self.deanon = deanon
+        self.deanon_uri = deanon_uri
+        self.i = 0
+        # logging
+        if log:
+            from collections import defaultdict, namedtuple
+            from types import SimpleNamespace as NS
+            self.logging = NS(
+                print = log_print,
+                log = defaultdict(list),
+                delta = namedtuple('delta', ['before', 'after'] ))
+    # TODO: make a method for applying one rule
+    def run1(self) -> Store:
+        if hasattr(self, 'logging'):
+            if self.logging.print:
+                line = '-'*10
+                logger.info(f"CYCLE {self.i} {line}")
+        for r in self.rules: # TODO: could be parallelized
+            # before
+            if hasattr(self, 'logging'):
+                before = len(self.db)
+                if self.logging.print:
+                    logger.info(f"{repr(r)}")
+                from time import monotonic
+                start_time = monotonic()
+            # do
+            _ = r(self.db)
+            if self.canon:
+                from .data import quads
+                _ = quads(_)
+                from .canon import quads
+                _ = quads(_)
+                if self.deanon:
+                    _ = quads.deanon(_, uri=self.deanon_uri)
+            from .db import ingest
+            # so if a rule returns a string,
+            # it /could/ go in fast in the case of no processing (canon/deanon)
+            ingest(self.db, _, flush=True)
+            del _
+            # after
+            if hasattr(self, 'logging'):
+                delta = self.logging.delta(before, len(self.db))
+                self.logging.log[r].append(delta)
+                if self.logging.print:
+                    logger.info(f"# triples before {delta.before }, after {delta.after } => {delta.after-delta.before}.")
+                    logger.info(f"took {'{0:.2f}'.format(monotonic()-start_time)} seconds")
+        self.i += 1
+        self.db.flush()
+        self.db.optimize()
+        return self.db
+    def stop(self) -> bool:
+        if self.MAX_NCYCLES <= 0:
+           return False
+        # could put validations here
+        if len(self.db) == len(self.run1()):
+            return True
+        else:
+            return False
+    def __iter__(self) -> Iterable[Store]:
+        while (not self.stop()):
+            if self.i >= self.MAX_NCYCLES:
+                if hasattr(self, 'logging'):
+                    if self.logging.print:
+                        logger.warning('reached max cycles')
+                break
+            yield self.db
+        else: # for case when nothing needs to happen
+            yield self.db
+    def run(self) -> Store:
+        for _ in self: continue
+        return self.db
+    __call__ = run

rdf_engine-182/src/rdf_engine/rules.py ADDED Viewed

@@ -0,0 +1,9 @@
+from typing import Callable
+from pyoxigraph import Store
+from .db import Ingestable
+Rule = Callable[[Store], Ingestable]
+# it's possible to come up with an abstraction for a Rule
+# but decided not to (here at least).
+# an engine run (itself) could be taken as a 'rule'.