trustgraph 0.5.2__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of trustgraph might be problematic. Click here for more details.
- {trustgraph-0.5.2 → trustgraph-0.5.3}/PKG-INFO +2 -2
- trustgraph-0.5.3/scripts/concat-parquet +45 -0
- trustgraph-0.5.3/scripts/dump-parquet +24 -0
- trustgraph-0.5.3/scripts/ge-dump-parquet +6 -0
- trustgraph-0.5.3/scripts/load-graph-embeddings +145 -0
- trustgraph-0.5.3/scripts/load-triples +144 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/loader +2 -2
- {trustgraph-0.5.2 → trustgraph-0.5.3}/setup.py +7 -3
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/base/base_processor.py +10 -10
- trustgraph-0.5.3/trustgraph/dump/graph_embeddings/parquet/processor.py +87 -0
- trustgraph-0.5.3/trustgraph/dump/graph_embeddings/parquet/writer.py +94 -0
- trustgraph-0.5.3/trustgraph/embeddings/ollama/__init__.py +3 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/kg/extract_definitions/extract.py +3 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/kg/extract_relationships/extract.py +4 -0
- trustgraph-0.5.3/trustgraph/storage/triples/__init__.py +0 -0
- trustgraph-0.5.3/trustgraph/storage/triples/cassandra/__main__.py +7 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph.egg-info/PKG-INFO +2 -2
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph.egg-info/SOURCES.txt +9 -0
- trustgraph-0.5.2/scripts/dump-parquet +0 -12
- {trustgraph-0.5.2 → trustgraph-0.5.3}/LICENSE +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/README.md +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/chunker-recursive +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/embeddings-hf +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/embeddings-ollama +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/embeddings-vectorize +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/ge-write-milvus +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/graph-rag +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/graph-show +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/graph-to-turtle +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/init-pulsar-manager +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/kg-extract-definitions +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/kg-extract-relationships +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/pdf-decoder +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/query +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/run-processing +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/text-completion-azure +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/text-completion-claude +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/text-completion-ollama +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/text-completion-vertexai +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/triples-dump-parquet +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/scripts/triples-write-cassandra +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/setup.cfg +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/base/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/base/consumer.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/base/consumer_producer.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/base/producer.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/chunking/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/chunking/recursive/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/chunking/recursive/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/chunking/recursive/chunker.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/decoding/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/decoding/pdf/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/decoding/pdf/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/decoding/pdf/pdf_decoder.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/dump/__init__.py +0 -0
- {trustgraph-0.5.2/trustgraph/dump/triples → trustgraph-0.5.3/trustgraph/dump/graph_embeddings}/__init__.py +0 -0
- {trustgraph-0.5.2/trustgraph/dump/triples → trustgraph-0.5.3/trustgraph/dump/graph_embeddings}/parquet/__init__.py +0 -0
- {trustgraph-0.5.2/trustgraph/dump/triples → trustgraph-0.5.3/trustgraph/dump/graph_embeddings}/parquet/__main__.py +0 -0
- {trustgraph-0.5.2/trustgraph/embeddings → trustgraph-0.5.3/trustgraph/dump/triples}/__init__.py +0 -0
- {trustgraph-0.5.2/trustgraph/embeddings/ollama → trustgraph-0.5.3/trustgraph/dump/triples/parquet}/__init__.py +0 -0
- {trustgraph-0.5.2/trustgraph/storage/graph_embeddings/milvus → trustgraph-0.5.3/trustgraph/dump/triples/parquet}/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/dump/triples/parquet/processor.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/dump/triples/parquet/writer.py +0 -0
- {trustgraph-0.5.2/trustgraph/kg → trustgraph-0.5.3/trustgraph/embeddings}/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/embeddings/hf/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/embeddings/hf/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/embeddings/hf/hf.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/embeddings/ollama/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/embeddings/ollama/processor.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/embeddings/vectorize/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/embeddings/vectorize/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/embeddings/vectorize/vectorize.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/embeddings_client.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/graph_rag.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/graph_rag_client.py +0 -0
- {trustgraph-0.5.2/trustgraph/model → trustgraph-0.5.3/trustgraph/kg}/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/kg/extract_definitions/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/kg/extract_definitions/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/kg/extract_relationships/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/kg/extract_relationships/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/llm_client.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/log_level.py +0 -0
- {trustgraph-0.5.2/trustgraph/model/text_completion → trustgraph-0.5.3/trustgraph/model}/__init__.py +0 -0
- {trustgraph-0.5.2/trustgraph/retrieval → trustgraph-0.5.3/trustgraph/model/text_completion}/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/azure/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/azure/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/azure/llm.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/claude/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/claude/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/claude/llm.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/ollama/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/ollama/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/ollama/llm.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/vertexai/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/vertexai/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/model/text_completion/vertexai/llm.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/processing/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/processing/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/processing/processing.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/prompts.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/rdf.py +0 -0
- {trustgraph-0.5.2/trustgraph/storage → trustgraph-0.5.3/trustgraph/retrieval}/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/retrieval/graph_rag/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/retrieval/graph_rag/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/retrieval/graph_rag/rag.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/schema.py +0 -0
- {trustgraph-0.5.2/trustgraph/storage/graph_embeddings → trustgraph-0.5.3/trustgraph/storage}/__init__.py +0 -0
- {trustgraph-0.5.2/trustgraph/storage/triples → trustgraph-0.5.3/trustgraph/storage/graph_embeddings}/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/storage/graph_embeddings/milvus/__init__.py +0 -0
- {trustgraph-0.5.2/trustgraph/storage/triples/cassandra → trustgraph-0.5.3/trustgraph/storage/graph_embeddings/milvus}/__main__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/storage/graph_embeddings/milvus/write.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/storage/triples/cassandra/__init__.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/storage/triples/cassandra/write.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/triple_vectors.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/trustgraph.py +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph.egg-info/dependency_links.txt +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph.egg-info/requires.txt +0 -0
- {trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph.egg-info/top_level.txt +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: trustgraph
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
|
5
5
|
Home-page: https://github.com/trustgraph-ai/trustgraph
|
|
6
|
-
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.5.2.tar.gz
|
|
6
|
+
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.5.3.tar.gz
|
|
7
7
|
Author: trustgraph.ai
|
|
8
8
|
Author-email: security@trustgraph.ai
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Concatenates multiple parquet files into a single parquet output
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pyarrow as pa
|
|
8
|
+
import pyarrow.parquet as pq
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import sys
|
|
11
|
+
import argparse
|
|
12
|
+
|
|
13
|
+
parser = argparse.ArgumentParser(
|
|
14
|
+
prog="combine-parquet",
|
|
15
|
+
description=__doc__
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
parser.add_argument(
|
|
19
|
+
'-i', '--input',
|
|
20
|
+
nargs='*',
|
|
21
|
+
help=f'Input files'
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
parser.add_argument(
|
|
25
|
+
'-o', '--output',
|
|
26
|
+
help=f'Output files'
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
args = parser.parse_args()
|
|
30
|
+
|
|
31
|
+
df = None
|
|
32
|
+
|
|
33
|
+
for file in args.input:
|
|
34
|
+
|
|
35
|
+
part = pq.read_table(file).to_pandas()
|
|
36
|
+
|
|
37
|
+
if df is None:
|
|
38
|
+
df = part
|
|
39
|
+
else:
|
|
40
|
+
df = pd.concat([df, part], ignore_index=True)
|
|
41
|
+
|
|
42
|
+
if df is not None:
|
|
43
|
+
|
|
44
|
+
table = pa.Table.from_pandas(df)
|
|
45
|
+
pq.write_table(table, args.output)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import pyarrow as pa
|
|
4
|
+
import pyarrow.csv as pc
|
|
5
|
+
import pyarrow.parquet as pq
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
df = None
|
|
10
|
+
|
|
11
|
+
for file in sys.argv[1:]:
|
|
12
|
+
|
|
13
|
+
part = pq.read_table(file).to_pandas()
|
|
14
|
+
|
|
15
|
+
if df is None:
|
|
16
|
+
df = part
|
|
17
|
+
else:
|
|
18
|
+
df = pd.concat([df, part], ignore_index=True)
|
|
19
|
+
|
|
20
|
+
if df is not None:
|
|
21
|
+
|
|
22
|
+
table = pa.Table.from_pandas(df)
|
|
23
|
+
pc.write_csv(table, sys.stdout.buffer)
|
|
24
|
+
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Loads Graph embeddings into TrustGraph processing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pulsar
|
|
8
|
+
from pulsar.schema import JsonSchema
|
|
9
|
+
from trustgraph.schema import GraphEmbeddings, Value
|
|
10
|
+
from trustgraph.schema import graph_embeddings_store_queue
|
|
11
|
+
import argparse
|
|
12
|
+
import os
|
|
13
|
+
import time
|
|
14
|
+
import pyarrow as pa
|
|
15
|
+
import pyarrow.parquet as pq
|
|
16
|
+
|
|
17
|
+
from trustgraph.log_level import LogLevel
|
|
18
|
+
|
|
19
|
+
class Loader:
|
|
20
|
+
|
|
21
|
+
def __init__(
|
|
22
|
+
self,
|
|
23
|
+
pulsar_host,
|
|
24
|
+
output_queue,
|
|
25
|
+
log_level,
|
|
26
|
+
file,
|
|
27
|
+
):
|
|
28
|
+
|
|
29
|
+
self.client = pulsar.Client(
|
|
30
|
+
pulsar_host,
|
|
31
|
+
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
self.producer = self.client.create_producer(
|
|
35
|
+
topic=output_queue,
|
|
36
|
+
schema=JsonSchema(GraphEmbeddings),
|
|
37
|
+
chunking_enabled=True,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
self.file = file
|
|
41
|
+
|
|
42
|
+
def run(self):
|
|
43
|
+
|
|
44
|
+
try:
|
|
45
|
+
|
|
46
|
+
path = self.file
|
|
47
|
+
|
|
48
|
+
print("Reading file...")
|
|
49
|
+
table = pq.read_table(path)
|
|
50
|
+
print("Loaded.")
|
|
51
|
+
|
|
52
|
+
names = set(table.column_names)
|
|
53
|
+
|
|
54
|
+
if "embeddings" not in names:
|
|
55
|
+
print("No 'embeddings' column")
|
|
56
|
+
|
|
57
|
+
if "entity" not in names:
|
|
58
|
+
print("No 'entity' column")
|
|
59
|
+
|
|
60
|
+
embc = table.column("embeddings")
|
|
61
|
+
entc = table.column("entity")
|
|
62
|
+
|
|
63
|
+
for emb, ent in zip(embc, entc):
|
|
64
|
+
|
|
65
|
+
b = emb.as_py()
|
|
66
|
+
n = ent.as_py()
|
|
67
|
+
|
|
68
|
+
r = GraphEmbeddings(
|
|
69
|
+
vectors=b,
|
|
70
|
+
entity=Value(
|
|
71
|
+
value=n,
|
|
72
|
+
is_uri=n.startswith("https:")
|
|
73
|
+
)
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
self.producer.send(r)
|
|
77
|
+
|
|
78
|
+
except Exception as e:
|
|
79
|
+
print(e, flush=True)
|
|
80
|
+
|
|
81
|
+
def __del__(self):
|
|
82
|
+
self.client.close()
|
|
83
|
+
|
|
84
|
+
def main():
|
|
85
|
+
|
|
86
|
+
parser = argparse.ArgumentParser(
|
|
87
|
+
prog='loader',
|
|
88
|
+
description=__doc__,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
|
|
92
|
+
default_output_queue = graph_embeddings_store_queue
|
|
93
|
+
|
|
94
|
+
parser.add_argument(
|
|
95
|
+
'-p', '--pulsar-host',
|
|
96
|
+
default=default_pulsar_host,
|
|
97
|
+
help=f'Pulsar host (default: {default_pulsar_host})',
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
parser.add_argument(
|
|
101
|
+
'-o', '--output-queue',
|
|
102
|
+
default=default_output_queue,
|
|
103
|
+
help=f'Output queue (default: {default_output_queue})'
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
parser.add_argument(
|
|
107
|
+
'-l', '--log-level',
|
|
108
|
+
type=LogLevel,
|
|
109
|
+
default=LogLevel.ERROR,
|
|
110
|
+
choices=list(LogLevel),
|
|
111
|
+
help=f'Output queue (default: info)'
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
parser.add_argument(
|
|
115
|
+
'-f', '--file',
|
|
116
|
+
required=True,
|
|
117
|
+
help=f'File to load'
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
args = parser.parse_args()
|
|
121
|
+
|
|
122
|
+
while True:
|
|
123
|
+
|
|
124
|
+
try:
|
|
125
|
+
p = Loader(
|
|
126
|
+
pulsar_host=args.pulsar_host,
|
|
127
|
+
output_queue=args.output_queue,
|
|
128
|
+
log_level=args.log_level,
|
|
129
|
+
file=args.file,
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
p.run()
|
|
133
|
+
|
|
134
|
+
print("File loaded.")
|
|
135
|
+
break
|
|
136
|
+
|
|
137
|
+
except Exception as e:
|
|
138
|
+
|
|
139
|
+
print("Exception:", e, flush=True)
|
|
140
|
+
print("Will retry...", flush=True)
|
|
141
|
+
|
|
142
|
+
time.sleep(10)
|
|
143
|
+
|
|
144
|
+
main()
|
|
145
|
+
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Loads Graph embeddings into TrustGraph processing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pulsar
|
|
8
|
+
from pulsar.schema import JsonSchema
|
|
9
|
+
from trustgraph.schema import Triple, Value
|
|
10
|
+
from trustgraph.schema import triples_store_queue
|
|
11
|
+
import argparse
|
|
12
|
+
import os
|
|
13
|
+
import time
|
|
14
|
+
import pyarrow as pa
|
|
15
|
+
import pyarrow.parquet as pq
|
|
16
|
+
|
|
17
|
+
from trustgraph.log_level import LogLevel
|
|
18
|
+
|
|
19
|
+
class Loader:
|
|
20
|
+
|
|
21
|
+
def __init__(
|
|
22
|
+
self,
|
|
23
|
+
pulsar_host,
|
|
24
|
+
output_queue,
|
|
25
|
+
log_level,
|
|
26
|
+
file,
|
|
27
|
+
):
|
|
28
|
+
|
|
29
|
+
self.client = pulsar.Client(
|
|
30
|
+
pulsar_host,
|
|
31
|
+
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
self.producer = self.client.create_producer(
|
|
35
|
+
topic=output_queue,
|
|
36
|
+
schema=JsonSchema(Triple),
|
|
37
|
+
chunking_enabled=True,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
self.file = file
|
|
41
|
+
|
|
42
|
+
def run(self):
|
|
43
|
+
|
|
44
|
+
try:
|
|
45
|
+
|
|
46
|
+
path = self.file
|
|
47
|
+
|
|
48
|
+
print("Reading file...")
|
|
49
|
+
table = pq.read_table(path)
|
|
50
|
+
print("Loaded.")
|
|
51
|
+
|
|
52
|
+
names = set(table.column_names)
|
|
53
|
+
|
|
54
|
+
if "s" not in names:
|
|
55
|
+
print("No 's' column")
|
|
56
|
+
|
|
57
|
+
if "p" not in names:
|
|
58
|
+
print("No 'p' column")
|
|
59
|
+
|
|
60
|
+
if "o" not in names:
|
|
61
|
+
print("No 'o' column")
|
|
62
|
+
|
|
63
|
+
sc = table.column("s")
|
|
64
|
+
pc = table.column("p")
|
|
65
|
+
oc = table.column("o")
|
|
66
|
+
|
|
67
|
+
for s, p, o in zip(sc, pc, oc):
|
|
68
|
+
|
|
69
|
+
r = Triple(
|
|
70
|
+
s=Value(value=s.as_py(), is_uri=True),
|
|
71
|
+
p=Value(value=p.as_py(), is_uri=True),
|
|
72
|
+
o=Value(value=o.as_py(), is_uri=o.as_py().startswith("https:"))
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
self.producer.send(r)
|
|
76
|
+
|
|
77
|
+
except Exception as e:
|
|
78
|
+
print(e, flush=True)
|
|
79
|
+
|
|
80
|
+
def __del__(self):
|
|
81
|
+
self.client.close()
|
|
82
|
+
|
|
83
|
+
def main():
|
|
84
|
+
|
|
85
|
+
parser = argparse.ArgumentParser(
|
|
86
|
+
prog='loader',
|
|
87
|
+
description=__doc__,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
|
|
91
|
+
default_output_queue = triples_store_queue
|
|
92
|
+
|
|
93
|
+
parser.add_argument(
|
|
94
|
+
'-p', '--pulsar-host',
|
|
95
|
+
default=default_pulsar_host,
|
|
96
|
+
help=f'Pulsar host (default: {default_pulsar_host})',
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
parser.add_argument(
|
|
100
|
+
'-o', '--output-queue',
|
|
101
|
+
default=default_output_queue,
|
|
102
|
+
help=f'Output queue (default: {default_output_queue})'
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
parser.add_argument(
|
|
106
|
+
'-l', '--log-level',
|
|
107
|
+
type=LogLevel,
|
|
108
|
+
default=LogLevel.ERROR,
|
|
109
|
+
choices=list(LogLevel),
|
|
110
|
+
help=f'Output queue (default: info)'
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
parser.add_argument(
|
|
114
|
+
'-f', '--file',
|
|
115
|
+
required=True,
|
|
116
|
+
help=f'File to load'
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
args = parser.parse_args()
|
|
120
|
+
|
|
121
|
+
while True:
|
|
122
|
+
|
|
123
|
+
try:
|
|
124
|
+
p = Loader(
|
|
125
|
+
pulsar_host=args.pulsar_host,
|
|
126
|
+
output_queue=args.output_queue,
|
|
127
|
+
log_level=args.log_level,
|
|
128
|
+
file=args.file,
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
p.run()
|
|
132
|
+
|
|
133
|
+
print("File loaded.")
|
|
134
|
+
break
|
|
135
|
+
|
|
136
|
+
except Exception as e:
|
|
137
|
+
|
|
138
|
+
print("Exception:", e, flush=True)
|
|
139
|
+
print("Will retry...", flush=True)
|
|
140
|
+
|
|
141
|
+
time.sleep(10)
|
|
142
|
+
|
|
143
|
+
main()
|
|
144
|
+
|
|
@@ -6,7 +6,7 @@ Loads a PDF documented into TrustGraph processing.
|
|
|
6
6
|
|
|
7
7
|
import pulsar
|
|
8
8
|
from pulsar.schema import JsonSchema
|
|
9
|
-
from trustgraph.schema import Document, Source
|
|
9
|
+
from trustgraph.schema import Document, Source, document_ingest_queue
|
|
10
10
|
import base64
|
|
11
11
|
import hashlib
|
|
12
12
|
import argparse
|
|
@@ -72,7 +72,7 @@ def main():
|
|
|
72
72
|
)
|
|
73
73
|
|
|
74
74
|
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
|
|
75
|
-
default_output_queue =
|
|
75
|
+
default_output_queue = document_ingest_queue
|
|
76
76
|
|
|
77
77
|
parser.add_argument(
|
|
78
78
|
'-p', '--pulsar-host',
|
|
@@ -4,7 +4,7 @@ import os
|
|
|
4
4
|
with open("README.md", "r") as fh:
|
|
5
5
|
long_description = fh.read()
|
|
6
6
|
|
|
7
|
-
version = "0.5.2"
|
|
7
|
+
version = "0.5.3"
|
|
8
8
|
|
|
9
9
|
setuptools.setup(
|
|
10
10
|
name="trustgraph",
|
|
@@ -48,9 +48,12 @@ setuptools.setup(
|
|
|
48
48
|
],
|
|
49
49
|
scripts=[
|
|
50
50
|
"scripts/chunker-recursive",
|
|
51
|
+
"scripts/concat-parquet",
|
|
52
|
+
"scripts/dump-parquet",
|
|
51
53
|
"scripts/embeddings-hf",
|
|
52
54
|
"scripts/embeddings-ollama",
|
|
53
55
|
"scripts/embeddings-vectorize",
|
|
56
|
+
"scripts/ge-dump-parquet",
|
|
54
57
|
"scripts/ge-write-milvus",
|
|
55
58
|
"scripts/graph-rag",
|
|
56
59
|
"scripts/graph-show",
|
|
@@ -58,6 +61,8 @@ setuptools.setup(
|
|
|
58
61
|
"scripts/init-pulsar-manager",
|
|
59
62
|
"scripts/kg-extract-definitions",
|
|
60
63
|
"scripts/kg-extract-relationships",
|
|
64
|
+
"scripts/load-graph-embeddings",
|
|
65
|
+
"scripts/load-triples",
|
|
61
66
|
"scripts/loader",
|
|
62
67
|
"scripts/pdf-decoder",
|
|
63
68
|
"scripts/query",
|
|
@@ -66,8 +71,7 @@ setuptools.setup(
|
|
|
66
71
|
"scripts/text-completion-claude",
|
|
67
72
|
"scripts/text-completion-ollama",
|
|
68
73
|
"scripts/text-completion-vertexai",
|
|
69
|
-
"scripts/triples-write-cassandra",
|
|
70
|
-
"scripts/dump-parquet",
|
|
71
74
|
"scripts/triples-dump-parquet",
|
|
75
|
+
"scripts/triples-write-cassandra",
|
|
72
76
|
]
|
|
73
77
|
)
|
|
@@ -79,20 +79,20 @@ class BaseProcessor:
|
|
|
79
79
|
@classmethod
|
|
80
80
|
def start(cls, prog, doc):
|
|
81
81
|
|
|
82
|
-
|
|
82
|
+
parser = argparse.ArgumentParser(
|
|
83
|
+
prog=prog,
|
|
84
|
+
description=doc
|
|
85
|
+
)
|
|
83
86
|
|
|
84
|
-
|
|
85
|
-
prog=prog,
|
|
86
|
-
description=doc
|
|
87
|
-
)
|
|
87
|
+
cls.add_args(parser)
|
|
88
88
|
|
|
89
|
-
|
|
89
|
+
args = parser.parse_args()
|
|
90
|
+
args = vars(args)
|
|
90
91
|
|
|
91
|
-
|
|
92
|
-
|
|
92
|
+
if args["metrics_enabled"]:
|
|
93
|
+
start_http_server(args["metrics_port"])
|
|
93
94
|
|
|
94
|
-
|
|
95
|
-
start_http_server(args["metrics_port"])
|
|
95
|
+
while True:
|
|
96
96
|
|
|
97
97
|
try:
|
|
98
98
|
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Write graph embeddings to parquet files in a directory.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pulsar
|
|
7
|
+
import base64
|
|
8
|
+
import os
|
|
9
|
+
import argparse
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
from .... trustgraph import TrustGraph
|
|
13
|
+
from .... schema import GraphEmbeddings
|
|
14
|
+
from .... schema import graph_embeddings_store_queue
|
|
15
|
+
from .... log_level import LogLevel
|
|
16
|
+
from .... base import Consumer
|
|
17
|
+
|
|
18
|
+
from . writer import ParquetWriter
|
|
19
|
+
|
|
20
|
+
module = ".".join(__name__.split(".")[1:-1])
|
|
21
|
+
|
|
22
|
+
default_input_queue = graph_embeddings_store_queue
|
|
23
|
+
default_subscriber = module
|
|
24
|
+
default_graph_host='localhost'
|
|
25
|
+
default_directory = "."
|
|
26
|
+
default_file_template = "graph-embeds-{id}.parquet"
|
|
27
|
+
default_rotation_time = 60
|
|
28
|
+
|
|
29
|
+
class Processor(Consumer):
|
|
30
|
+
|
|
31
|
+
def __init__(self, **params):
|
|
32
|
+
|
|
33
|
+
input_queue = params.get("input_queue", default_input_queue)
|
|
34
|
+
subscriber = params.get("subscriber", default_subscriber)
|
|
35
|
+
directory = params.get("directory", default_directory)
|
|
36
|
+
file_template = params.get("file_template", default_file_template)
|
|
37
|
+
rotation_time = params.get("rotation_time", default_rotation_time)
|
|
38
|
+
|
|
39
|
+
super(Processor, self).__init__(
|
|
40
|
+
**params | {
|
|
41
|
+
"input_queue": input_queue,
|
|
42
|
+
"subscriber": subscriber,
|
|
43
|
+
"input_schema": GraphEmbeddings,
|
|
44
|
+
}
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
self.writer = ParquetWriter(directory, file_template, rotation_time)
|
|
48
|
+
|
|
49
|
+
def __del__(self):
|
|
50
|
+
if hasattr(self, "writer"):
|
|
51
|
+
del self.writer
|
|
52
|
+
|
|
53
|
+
def handle(self, msg):
|
|
54
|
+
|
|
55
|
+
v = msg.value()
|
|
56
|
+
self.writer.write(v.vectors, v.entity.value)
|
|
57
|
+
|
|
58
|
+
@staticmethod
|
|
59
|
+
def add_args(parser):
|
|
60
|
+
|
|
61
|
+
Consumer.add_args(
|
|
62
|
+
parser, default_input_queue, default_subscriber,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
parser.add_argument(
|
|
66
|
+
'-d', '--directory',
|
|
67
|
+
default=default_directory,
|
|
68
|
+
help=f'Directory to write to (default: {default_directory})'
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
parser.add_argument(
|
|
72
|
+
'-f', '--file-template',
|
|
73
|
+
default=default_file_template,
|
|
74
|
+
help=f'Directory to write to (default: {default_file_template})'
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
parser.add_argument(
|
|
78
|
+
'-t', '--rotation-time',
|
|
79
|
+
type=int,
|
|
80
|
+
default=default_rotation_time,
|
|
81
|
+
help=f'Rotation time / seconds (default: {default_rotation_time})'
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
def run():
|
|
85
|
+
|
|
86
|
+
Processor.start(module, __doc__)
|
|
87
|
+
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
|
|
2
|
+
import threading
|
|
3
|
+
import queue
|
|
4
|
+
import time
|
|
5
|
+
import uuid
|
|
6
|
+
import pyarrow as pa
|
|
7
|
+
import pyarrow.parquet as pq
|
|
8
|
+
|
|
9
|
+
class ParquetWriter:
|
|
10
|
+
|
|
11
|
+
def __init__(self, directory, file_template, rotation_time):
|
|
12
|
+
self.directory = directory
|
|
13
|
+
self.file_template = file_template
|
|
14
|
+
self.rotation_time = rotation_time
|
|
15
|
+
|
|
16
|
+
self.q = queue.Queue()
|
|
17
|
+
|
|
18
|
+
self.running = True
|
|
19
|
+
|
|
20
|
+
self.thread = threading.Thread(target=(self.writer_thread))
|
|
21
|
+
self.thread.start()
|
|
22
|
+
|
|
23
|
+
def writer_thread(self):
|
|
24
|
+
|
|
25
|
+
items = []
|
|
26
|
+
|
|
27
|
+
timeout = None
|
|
28
|
+
|
|
29
|
+
while self.running:
|
|
30
|
+
|
|
31
|
+
try:
|
|
32
|
+
|
|
33
|
+
item = self.q.get(timeout=1)
|
|
34
|
+
|
|
35
|
+
if timeout == None:
|
|
36
|
+
timeout = time.time() + self.rotation_time
|
|
37
|
+
|
|
38
|
+
items.append(item)
|
|
39
|
+
|
|
40
|
+
except queue.Empty:
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
if timeout:
|
|
44
|
+
if time.time() > timeout:
|
|
45
|
+
|
|
46
|
+
self.write_file(items)
|
|
47
|
+
timeout = None
|
|
48
|
+
items = []
|
|
49
|
+
|
|
50
|
+
def write_file(self, items):
|
|
51
|
+
|
|
52
|
+
try:
|
|
53
|
+
|
|
54
|
+
schema = pa.schema([
|
|
55
|
+
pa.field('embeddings', pa.list_(pa.list_(pa.float64()))),
|
|
56
|
+
pa.field('entity', pa.string()),
|
|
57
|
+
])
|
|
58
|
+
|
|
59
|
+
fname = self.file_template.format(id=str(uuid.uuid4()))
|
|
60
|
+
path = f"{self.directory}/{fname}"
|
|
61
|
+
|
|
62
|
+
writer = pq.ParquetWriter(path, schema)
|
|
63
|
+
|
|
64
|
+
batch = pa.record_batch(
|
|
65
|
+
[
|
|
66
|
+
[i[0] for i in items],
|
|
67
|
+
[i[1] for i in items],
|
|
68
|
+
],
|
|
69
|
+
names=['embeddings', 'entity']
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
writer.write_batch(batch)
|
|
73
|
+
|
|
74
|
+
writer.close()
|
|
75
|
+
|
|
76
|
+
print(f"Wrote {path}.")
|
|
77
|
+
|
|
78
|
+
except Exception as e:
|
|
79
|
+
|
|
80
|
+
print("Parquet write:", e)
|
|
81
|
+
|
|
82
|
+
def write(self, embeds, ent):
|
|
83
|
+
self.q.put((embeds, ent))
|
|
84
|
+
|
|
85
|
+
def __del__(self):
|
|
86
|
+
|
|
87
|
+
self.running = False
|
|
88
|
+
|
|
89
|
+
if hasattr(self, "q"):
|
|
90
|
+
self.thread.join()
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
|
|
File without changes
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: trustgraph
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
|
5
5
|
Home-page: https://github.com/trustgraph-ai/trustgraph
|
|
6
|
-
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.5.2.tar.gz
|
|
6
|
+
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.5.3.tar.gz
|
|
7
7
|
Author: trustgraph.ai
|
|
8
8
|
Author-email: security@trustgraph.ai
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -2,10 +2,12 @@ LICENSE
|
|
|
2
2
|
README.md
|
|
3
3
|
setup.py
|
|
4
4
|
scripts/chunker-recursive
|
|
5
|
+
scripts/concat-parquet
|
|
5
6
|
scripts/dump-parquet
|
|
6
7
|
scripts/embeddings-hf
|
|
7
8
|
scripts/embeddings-ollama
|
|
8
9
|
scripts/embeddings-vectorize
|
|
10
|
+
scripts/ge-dump-parquet
|
|
9
11
|
scripts/ge-write-milvus
|
|
10
12
|
scripts/graph-rag
|
|
11
13
|
scripts/graph-show
|
|
@@ -13,6 +15,8 @@ scripts/graph-to-turtle
|
|
|
13
15
|
scripts/init-pulsar-manager
|
|
14
16
|
scripts/kg-extract-definitions
|
|
15
17
|
scripts/kg-extract-relationships
|
|
18
|
+
scripts/load-graph-embeddings
|
|
19
|
+
scripts/load-triples
|
|
16
20
|
scripts/loader
|
|
17
21
|
scripts/pdf-decoder
|
|
18
22
|
scripts/query
|
|
@@ -53,6 +57,11 @@ trustgraph/decoding/pdf/__init__.py
|
|
|
53
57
|
trustgraph/decoding/pdf/__main__.py
|
|
54
58
|
trustgraph/decoding/pdf/pdf_decoder.py
|
|
55
59
|
trustgraph/dump/__init__.py
|
|
60
|
+
trustgraph/dump/graph_embeddings/__init__.py
|
|
61
|
+
trustgraph/dump/graph_embeddings/parquet/__init__.py
|
|
62
|
+
trustgraph/dump/graph_embeddings/parquet/__main__.py
|
|
63
|
+
trustgraph/dump/graph_embeddings/parquet/processor.py
|
|
64
|
+
trustgraph/dump/graph_embeddings/parquet/writer.py
|
|
56
65
|
trustgraph/dump/triples/__init__.py
|
|
57
66
|
trustgraph/dump/triples/parquet/__init__.py
|
|
58
67
|
trustgraph/dump/triples/parquet/__main__.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{trustgraph-0.5.2/trustgraph/embeddings → trustgraph-0.5.3/trustgraph/dump/triples}/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{trustgraph-0.5.2/trustgraph/model/text_completion → trustgraph-0.5.3/trustgraph/model}/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{trustgraph-0.5.2 → trustgraph-0.5.3}/trustgraph/storage/graph_embeddings/milvus/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|