trustgraph 0.3.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of trustgraph might be problematic.
- {trustgraph-0.3.0 → trustgraph-0.4.1}/PKG-INFO +3 -2
- {trustgraph-0.3.0 → trustgraph-0.4.1}/setup.py +2 -1
- trustgraph-0.4.1/trustgraph/base/processor.py +360 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/chunker/recursive/chunker.py +15 -18
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/decoder/pdf/pdf_decoder.py +12 -15
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/hf/hf.py +13 -16
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/ollama/processor.py +12 -17
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/vectorize/vectorize.py +13 -16
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph/cassandra_write/write.py +10 -20
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph_rag.py +3 -3
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_definitions/extract.py +12 -15
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_relationships/extract.py +25 -17
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/azure_text/llm.py +15 -17
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/claude_text/llm.py +17 -19
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/ollama_text/llm.py +27 -17
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/vertexai_text/llm.py +15 -18
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rag/graph/rag.py +24 -25
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/vector/milvus_write/write.py +12 -13
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/PKG-INFO +3 -2
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/requires.txt +1 -0
- trustgraph-0.3.0/trustgraph/base/processor.py +0 -266
- {trustgraph-0.3.0 → trustgraph-0.4.1}/LICENSE +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/README.md +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/chunker-recursive +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/embeddings-hf +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/embeddings-ollama +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/embeddings-vectorize +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/graph-rag +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/graph-show +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/graph-to-turtle +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/graph-write-cassandra +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/init-pulsar-manager +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/kg-extract-definitions +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/kg-extract-relationships +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/llm-azure-text +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/llm-claude-text +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/llm-ollama-text +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/llm-vertexai-text +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/loader +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/pdf-decoder +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/query +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/run-processing +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/vector-write-milvus +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/setup.cfg +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/base/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/chunker/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/chunker/recursive/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/chunker/recursive/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/decoder/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/decoder/pdf/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/decoder/pdf/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/hf/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/hf/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/ollama/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/ollama/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/vectorize/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/vectorize/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings_client.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph/cassandra_write/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph/cassandra_write/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph_rag_client.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_definitions/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_definitions/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_relationships/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_relationships/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/azure_text/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/azure_text/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/claude_text/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/claude_text/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/ollama_text/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/ollama_text/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/vertexai_text/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/vertexai_text/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm_client.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/log_level.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/processing/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/processing/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/processing/processing.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/prompts.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rag/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rag/graph/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rag/graph/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rdf.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/schema.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/triple_vectors.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/trustgraph.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/vector/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/vector/milvus_write/__init__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/vector/milvus_write/__main__.py +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/SOURCES.txt +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/dependency_links.txt +0 -0
- {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/top_level.txt +0 -0
{trustgraph-0.3.0 → trustgraph-0.4.1}/PKG-INFO

@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: trustgraph
-Version: 0.3.0
+Version: 0.4.1
 Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
 Home-page: https://github.com/trustgraph-ai/trustgraph
-Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.3.0.tar.gz
+Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.4.1.tar.gz
 Author: trustgraph.ai
 Author-email: security@trustgraph.ai
 Classifier: Programming Language :: Python :: 3

@@ -31,6 +31,7 @@ Requires-Dist: pypdf
 Requires-Dist: anthropic
 Requires-Dist: google-cloud-aiplatform
 Requires-Dist: pyyaml
+Requires-Dist: prometheus-client
 
 
 # TrustGraph
{trustgraph-0.3.0 → trustgraph-0.4.1}/setup.py

@@ -4,7 +4,7 @@ import os
 with open("README.md", "r") as fh:
     long_description = fh.read()
 
-version = "0.3.0"
+version = "0.4.1"
 
 setuptools.setup(
     name="trustgraph",

@@ -43,6 +43,7 @@ setuptools.setup(
         "anthropic",
         "google-cloud-aiplatform",
         "pyyaml",
+        "prometheus-client",
     ],
     scripts=[
         "scripts/chunker-recursive",
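Both dependency additions above track the same functional change: prometheus-client backs the metrics endpoint that the new base processor starts. A minimal sketch of what the library itself provides, outside any TrustGraph code (the metric name and port simply mirror the defaults registered in processor.py below):

    import time
    from prometheus_client import start_http_server, Counter

    # Same metric name the new base classes register for produced messages.
    items = Counter('output_count', 'Output items created')

    # Serves Prometheus text-format metrics at http://localhost:8000/metrics;
    # 8000 matches the new --metrics-port default.
    start_http_server(8000)

    while True:
        items.inc()
        time.sleep(1)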
trustgraph-0.4.1/trustgraph/base/processor.py

@@ -0,0 +1,360 @@
+
+import os
+import argparse
+import pulsar
+import _pulsar
+import time
+from pulsar.schema import JsonSchema
+from prometheus_client import start_http_server, Histogram, Info, Counter
+
+from .. log_level import LogLevel
+
+class BaseProcessor:
+
+    default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
+
+    def __init__(self, **params):
+
+        self.client = None
+
+        if not hasattr(__class__, "params_metric"):
+            __class__.params_metric = Info(
+                'params', 'Parameters configuration'
+            )
+
+        # FIXME: Maybe outputs information it should not
+        __class__.params_metric.info({
+            k: str(params[k])
+            for k in params
+        })
+
+        pulsar_host = params.get("pulsar_host", self.default_pulsar_host)
+        log_level = params.get("log_level", LogLevel.INFO)
+
+        self.pulsar_host = pulsar_host
+
+        self.client = pulsar.Client(
+            pulsar_host,
+            logger=pulsar.ConsoleLogger(log_level.to_pulsar())
+        )
+
+    def __del__(self):
+
+        if self.client:
+            self.client.close()
+
+    @staticmethod
+    def add_args(parser):
+
+        parser.add_argument(
+            '-p', '--pulsar-host',
+            default=__class__.default_pulsar_host,
+            help=f'Pulsar host (default: {__class__.default_pulsar_host})',
+        )
+
+        parser.add_argument(
+            '-l', '--log-level',
+            type=LogLevel,
+            default=LogLevel.INFO,
+            choices=list(LogLevel),
+            help=f'Output queue (default: info)'
+        )
+
+        parser.add_argument(
+            '-M', '--metrics-enabled',
+            type=bool,
+            default=True,
+            help=f'Pulsar host (default: true)',
+        )
+
+        parser.add_argument(
+            '-P', '--metrics-port',
+            type=int,
+            default=8000,
+            help=f'Pulsar host (default: 8000)',
+        )
+
+    def run(self):
+        raise RuntimeError("Something should have implemented the run method")
+
+    @classmethod
+    def start(cls, prog, doc):
+
+        while True:
+
+            parser = argparse.ArgumentParser(
+                prog=prog,
+                description=doc
+            )
+
+            cls.add_args(parser)
+
+            args = parser.parse_args()
+            args = vars(args)
+
+            if args["metrics_enabled"]:
+                start_http_server(args["metrics_port"])
+
+            try:
+
+                p = cls(**args)
+                p.run()
+
+            except KeyboardInterrupt:
+                print("Keyboard interrupt.")
+                return
+
+            except _pulsar.Interrupted:
+                print("Pulsar Interrupted.")
+                return
+
+            except Exception as e:
+
+                print(type(e))
+
+                print("Exception:", e, flush=True)
+                print("Will retry...", flush=True)
+
+                time.sleep(10)
+
+class Consumer(BaseProcessor):
+
+    def __init__(self, **params):
+
+        super(Consumer, self).__init__(**params)
+
+        input_queue = params.get("input_queue")
+        subscriber = params.get("subscriber")
+        input_schema = params.get("input_schema")
+
+        if input_schema == None:
+            raise RuntimeError("input_schema must be specified")
+
+        if not hasattr(__class__, "request_metric"):
+            __class__.request_metric = Histogram(
+                'request_latency', 'Request latency (seconds)'
+            )
+
+        if not hasattr(__class__, "pubsub_metric"):
+            __class__.pubsub_metric = Info(
+                'pubsub', 'Pub/sub configuration'
+            )
+
+        if not hasattr(__class__, "processing_metric"):
+            __class__.processing_metric = Counter(
+                'processing_count', 'Processing count', ["status"]
+            )
+
+        __class__.pubsub_metric.info({
+            "input_queue": input_queue,
+            "subscriber": subscriber,
+            "input_schema": input_schema.__name__,
+        })
+
+        self.consumer = self.client.subscribe(
+            input_queue, subscriber,
+            schema=JsonSchema(input_schema),
+        )
+
+    def run(self):
+
+        while True:
+
+            msg = self.consumer.receive()
+
+            try:
+
+                with __class__.request_metric.time():
+                    self.handle(msg)
+
+                # Acknowledge successful processing of the message
+                self.consumer.acknowledge(msg)
+
+                __class__.processing_metric.labels(status="success").inc()
+
+            except Exception as e:
+
+                print("Exception:", e, flush=True)
+
+                # Message failed to be processed
+                self.consumer.negative_acknowledge(msg)
+
+                __class__.processing_metric.labels(status="error").inc()
+
+    @staticmethod
+    def add_args(parser, default_input_queue, default_subscriber):
+
+        BaseProcessor.add_args(parser)
+
+        parser.add_argument(
+            '-i', '--input-queue',
+            default=default_input_queue,
+            help=f'Input queue (default: {default_input_queue})'
+        )
+
+        parser.add_argument(
+            '-s', '--subscriber',
+            default=default_subscriber,
+            help=f'Queue subscriber name (default: {default_subscriber})'
+        )
+
+class ConsumerProducer(BaseProcessor):
+
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue")
+        output_queue = params.get("output_queue")
+        subscriber = params.get("subscriber")
+        input_schema = params.get("input_schema")
+        output_schema = params.get("output_schema")
+
+        if not hasattr(__class__, "request_metric"):
+            __class__.request_metric = Histogram(
+                'request_latency', 'Request latency (seconds)'
+            )
+
+        if not hasattr(__class__, "output_metric"):
+            __class__.output_metric = Counter(
+                'output_count', 'Output items created'
+            )
+
+        if not hasattr(__class__, "pubsub_metric"):
+            __class__.pubsub_metric = Info(
+                'pubsub', 'Pub/sub configuration'
+            )
+
+        if not hasattr(__class__, "processing_metric"):
+            __class__.processing_metric = Counter(
+                'processing_count', 'Processing count', ["status"]
+            )
+
+        __class__.pubsub_metric.info({
+            "input_queue": input_queue,
+            "output_queue": output_queue,
+            "subscriber": subscriber,
+            "input_schema": input_schema.__name__,
+            "output_schema": output_schema.__name__,
+        })
+
+        super(ConsumerProducer, self).__init__(**params)
+
+        if input_schema == None:
+            raise RuntimeError("input_schema must be specified")
+
+        if output_schema == None:
+            raise RuntimeError("output_schema must be specified")
+
+        self.consumer = self.client.subscribe(
+            input_queue, subscriber,
+            schema=JsonSchema(input_schema),
+        )
+
+        self.producer = self.client.create_producer(
+            topic=output_queue,
+            schema=JsonSchema(output_schema),
+        )
+
+    def run(self):
+
+        while True:
+
+            msg = self.consumer.receive()
+
+            try:
+
+                with __class__.request_metric.time():
+                    resp = self.handle(msg)
+
+                # Acknowledge successful processing of the message
+                self.consumer.acknowledge(msg)
+
+                __class__.processing_metric.labels(status="success").inc()
+
+            except Exception as e:
+
+                print("Exception:", e, flush=True)
+
+                # Message failed to be processed
+                self.consumer.negative_acknowledge(msg)
+
+                __class__.processing_metric.labels(status="error").inc()
+
+    def send(self, msg, properties={}):
+        self.producer.send(msg, properties)
+        __class__.output_metric.inc()
+
+    @staticmethod
+    def add_args(
+            parser, default_input_queue, default_subscriber,
+            default_output_queue,
+    ):
+
+        BaseProcessor.add_args(parser)
+
+        parser.add_argument(
+            '-i', '--input-queue',
+            default=default_input_queue,
+            help=f'Input queue (default: {default_input_queue})'
+        )
+
+        parser.add_argument(
+            '-s', '--subscriber',
+            default=default_subscriber,
+            help=f'Queue subscriber name (default: {default_subscriber})'
+        )
+
+        parser.add_argument(
+            '-o', '--output-queue',
+            default=default_output_queue,
+            help=f'Output queue (default: {default_output_queue})'
+        )
+
+class Producer(BaseProcessor):
+
+    def __init__(self, **params):
+
+        output_queue = params.get("output_queue")
+        output_schema = params.get("output_schema")
+
+        if not hasattr(__class__, "output_metric"):
+            __class__.output_metric = Counter(
+                'output_count', 'Output items created'
+            )
+
+        if not hasattr(__class__, "pubsub_metric"):
+            __class__.pubsub_metric = Info(
+                'pubsub', 'Pub/sub configuration'
+            )
+
+        __class__.pubsub_metric.info({
+            "output_queue": output_queue,
+            "output_schema": output_schema.__name__,
+        })
+
+        super(Producer, self).__init__(**params)
+
+        if output_schema == None:
+            raise RuntimeError("output_schema must be specified")
+
+        self.producer = self.client.create_producer(
+            topic=output_queue,
+            schema=JsonSchema(output_schema),
+        )
+
+    def send(self, msg, properties={}):
+        self.producer.send(msg, properties)
+        __class__.output_metric.inc()
+
+    @staticmethod
+    def add_args(
+            parser, default_input_queue, default_subscriber,
+            default_output_queue,
+    ):
+
+        BaseProcessor.add_args(parser)
+
+        parser.add_argument(
+            '-o', '--output-queue',
+            default=default_output_queue,
+            help=f'Output queue (default: {default_output_queue})'
+        )
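The remaining changes rebase each pipeline component onto these classes. For orientation, here is a hypothetical minimal processor written in the new style; the schema classes, queue names, and the __main__ wiring are illustrative assumptions, not code from the package:

    """Hypothetical minimal processor built on the new base classes."""

    from pulsar.schema import Record, String

    from trustgraph.base.processor import ConsumerProducer

    class TextIn(Record):
        text = String()

    class TextOut(Record):
        text = String()

    class UppercaseProcessor(ConsumerProducer):

        def __init__(self, **params):

            input_queue = params.get("input_queue", "upper-in")
            output_queue = params.get("output_queue", "upper-out")
            subscriber = params.get("subscriber", "uppercase")

            super(UppercaseProcessor, self).__init__(
                **params | {
                    "input_queue": input_queue,
                    "output_queue": output_queue,
                    "subscriber": subscriber,
                    "input_schema": TextIn,
                    "output_schema": TextOut,
                }
            )

        def handle(self, msg):
            v = msg.value()
            # send() publishes on the output queue and bumps output_count.
            self.send(TextOut(text=v.text.upper()))

        @staticmethod
        def add_args(parser):
            ConsumerProducer.add_args(
                parser, "upper-in", "uppercase", "upper-out"
            )

    if __name__ == "__main__":
        UppercaseProcessor.start("uppercase", __doc__)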
{trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/chunker/recursive/chunker.py

@@ -17,25 +17,22 @@ default_subscriber = 'chunker-recursive'
 
 class Processor(ConsumerProducer):
 
-    def __init__(
-            self,
-            input_queue=default_input_queue,
-            output_queue=default_output_queue,
-            pulsar_host=default_pulsar_host,
-            subscriber=default_subscriber,
-            log_level=LogLevel.INFO,
-            chunk_size=2000,
-            chunk_overlap=100,
-    ):
-
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        output_queue = params.get("output_queue", default_output_queue)
+        subscriber = params.get("subscriber", default_subscriber)
+        chunk_size = params.get("chunk_size", 2000)
+        chunk_overlap = params.get("chunk_overlap", 100)
+
         super(Processor, self).__init__(
-            input_queue=input_queue,
-            output_queue=output_queue,
-            pulsar_host=pulsar_host,
-            subscriber=subscriber,
-            log_level=log_level,
-            input_schema=TextDocument,
-            output_schema=Chunk,
+            **params | {
+                "input_queue": input_queue,
+                "output_queue": output_queue,
+                "subscriber": subscriber,
+                "input_schema": TextDocument,
+                "output_schema": Chunk,
+            }
         )
 
         self.text_splitter = RecursiveCharacterTextSplitter(
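The **params | {...} idiom used in this and the following rewrites relies on PEP 584 dict union (Python 3.9+): on key collisions the right-hand operand wins, so the processor's fixed schema entries override anything that arrived via **params. A small demonstration, under that assumption:

    params = {"input_queue": "cli-queue", "input_schema": "stale-value"}
    merged = params | {"input_schema": "TextDocument"}

    # The right operand wins on collisions:
    print(merged)  # {'input_queue': 'cli-queue', 'input_schema': 'TextDocument'}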
{trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/decoder/pdf/pdf_decoder.py

@@ -18,23 +18,20 @@ default_subscriber = 'pdf-decoder'
 
 class Processor(ConsumerProducer):
 
-    def __init__(
-            self,
-            input_queue=default_input_queue,
-            output_queue=default_output_queue,
-            pulsar_host=default_pulsar_host,
-            subscriber=default_subscriber,
-            log_level=LogLevel.INFO,
-    ):
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        output_queue = params.get("output_queue", default_output_queue)
+        subscriber = params.get("subscriber", default_subscriber)
 
         super(Processor, self).__init__(
-            input_queue=input_queue,
-            output_queue=output_queue,
-            pulsar_host=pulsar_host,
-            subscriber=subscriber,
-            log_level=log_level,
-            input_schema=Document,
-            output_schema=TextDocument,
+            **params | {
+                "input_queue": input_queue,
+                "output_queue": output_queue,
+                "subscriber": subscriber,
+                "input_schema": Document,
+                "output_schema": TextDocument,
+            }
         )
 
         print("PDF inited")
{trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/hf/hf.py

@@ -17,24 +17,21 @@ default_model="all-MiniLM-L6-v2"
 
 class Processor(ConsumerProducer):
 
-    def __init__(
-            self,
-            input_queue=default_input_queue,
-            output_queue=default_output_queue,
-            pulsar_host=default_pulsar_host,
-            subscriber=default_subscriber,
-            log_level=LogLevel.INFO,
-            model=default_model,
-    ):
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        output_queue = params.get("output_queue", default_output_queue)
+        subscriber = params.get("subscriber", default_subscriber)
+        model = params.get("model", default_model)
 
         super(Processor, self).__init__(
-            input_queue=input_queue,
-            output_queue=output_queue,
-            pulsar_host=pulsar_host,
-            subscriber=subscriber,
-            log_level=log_level,
-            input_schema=EmbeddingsRequest,
-            output_schema=EmbeddingsResponse,
+            **params | {
+                "input_queue": input_queue,
+                "output_queue": output_queue,
+                "subscriber": subscriber,
+                "input_schema": EmbeddingsRequest,
+                "output_schema": EmbeddingsResponse,
+            }
         )
 
         self.embeddings = HuggingFaceEmbeddings(model_name=model)
{trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/ollama/processor.py

@@ -17,25 +17,20 @@ default_ollama = 'http://localhost:11434'
 
 class Processor(ConsumerProducer):
 
-    def __init__(
-            self,
-            input_queue=default_input_queue,
-            output_queue=default_output_queue,
-            pulsar_host=default_pulsar_host,
-            subscriber=default_subscriber,
-            log_level=LogLevel.INFO,
-            model=default_model,
-            ollama=default_ollama,
-    ):
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        output_queue = params.get("output_queue", default_output_queue)
+        subscriber = params.get("subscriber", default_subscriber)
 
         super(Processor, self).__init__(
-            input_queue=input_queue,
-            output_queue=output_queue,
-            pulsar_host=pulsar_host,
-            subscriber=subscriber,
-            log_level=log_level,
-            input_schema=EmbeddingsRequest,
-            output_schema=EmbeddingsResponse,
+            **params | {
+                "input_queue": input_queue,
+                "output_queue": output_queue,
+                "subscriber": subscriber,
+                "input_schema": EmbeddingsRequest,
+                "output_schema": EmbeddingsResponse,
+            }
         )
 
         self.embeddings = OllamaEmbeddings(base_url=ollama, model=model)
{trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/vectorize/vectorize.py

@@ -15,26 +15,23 @@ default_subscriber = 'embeddings-vectorizer'
 
 class Processor(ConsumerProducer):
 
-    def __init__(
-            self,
-            input_queue=default_input_queue,
-            output_queue=default_output_queue,
-            pulsar_host=default_pulsar_host,
-            subscriber=default_subscriber,
-            log_level=LogLevel.INFO,
-    ):
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        output_queue = params.get("output_queue", default_output_queue)
+        subscriber = params.get("subscriber", default_subscriber)
 
         super(Processor, self).__init__(
-            input_queue=input_queue,
-            output_queue=output_queue,
-            pulsar_host=pulsar_host,
-            subscriber=subscriber,
-            log_level=log_level,
-            input_schema=Chunk,
-            output_schema=VectorsChunk,
+            **params | {
+                "input_queue": input_queue,
+                "output_queue": output_queue,
+                "subscriber": subscriber,
+                "input_schema": Chunk,
+                "output_schema": VectorsChunk,
+            }
         )
 
-        self.embeddings = EmbeddingsClient(pulsar_host=pulsar_host)
+        self.embeddings = EmbeddingsClient(pulsar_host=self.pulsar_host)
 
     def emit(self, source, chunk, vectors):
 
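The last hunk above is a genuine fix rather than mechanical refactoring: under the old keyword-argument style pulsar_host was a local parameter, but under the **params convention only the attribute stored by the base class survives. A stripped-down sketch of the pattern, with hypothetical class names:

    class Base:
        def __init__(self, **params):
            # The base class is now the only place the host gets unpacked.
            self.pulsar_host = params.get("pulsar_host", "pulsar://pulsar:6650")

    class Sub(Base):
        def __init__(self, **params):
            super().__init__(**params)
            # 0.3.0 referred to a local pulsar_host variable here; with the
            # **params convention only the attribute set by Base exists.
            print("connecting via", self.pulsar_host)

    Sub(pulsar_host="pulsar://example:6650")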
{trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph/cassandra_write/write.py

@@ -20,27 +20,22 @@ default_graph_host='localhost'
 
 class Processor(Consumer):
 
-    def __init__(
-            self,
-            input_queue=default_input_queue,
-            pulsar_host=default_pulsar_host,
-            subscriber=default_subscriber,
-            graph_host=default_graph_host,
-            log_level=LogLevel.INFO,
-    ):
+    def __init__(self, **params):
+
+        input_queue = params.get("input_queue", default_input_queue)
+        subscriber = params.get("subscriber", default_subscriber)
+        graph_host = params.get("graph_host", default_graph_host)
 
         super(Processor, self).__init__(
-            input_queue=input_queue,
-            pulsar_host=pulsar_host,
-            subscriber=subscriber,
-            log_level=log_level,
-            input_schema=Triple,
+            **params | {
+                "input_queue": input_queue,
+                "subscriber": subscriber,
+                "input_schema": Triple,
+            }
         )
 
         self.tg = TrustGraph([graph_host])
 
-        self.count = 0
-
     def handle(self, msg):
 
         v = msg.value()

@@ -51,11 +46,6 @@ class Processor(Consumer):
             v.o.value
         )
 
-        self.count += 1
-
-        if (self.count % 1000) == 0:
-            print(self.count, "...", flush=True)
-
     @staticmethod
     def add_args(parser):
 
{trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph_rag.py

@@ -18,7 +18,7 @@ class GraphRag:
         verbose=False,
         entity_limit=50,
         triple_limit=30,
-
+        max_subgraph_size=3000,
     ):
 
         self.verbose=verbose

@@ -37,7 +37,7 @@ class GraphRag:
 
         self.entity_limit=entity_limit
         self.query_limit=triple_limit
-        self.
+        self.max_subgraph_size=max_subgraph_size
 
         self.label_cache = {}
 

@@ -149,7 +149,7 @@ class GraphRag:
 
         subgraph = list(subgraph)
 
-        subgraph = subgraph[0:self.
+        subgraph = subgraph[0:self.max_subgraph_size]
 
         if self.verbose:
             print("Subgraph:", flush=True)