trustgraph 0.2.4__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of trustgraph might be problematic. Click here for more details.
- {trustgraph-0.2.4 → trustgraph-0.3.1}/PKG-INFO +2 -2
- {trustgraph-0.2.4 → trustgraph-0.3.1}/setup.py +1 -1
- trustgraph-0.3.1/trustgraph/base/processor.py +268 -0
- trustgraph-0.3.1/trustgraph/chunker/recursive/chunker.py +99 -0
- trustgraph-0.3.1/trustgraph/decoder/pdf/pdf_decoder.py +87 -0
- trustgraph-0.3.1/trustgraph/embeddings/hf/hf.py +77 -0
- trustgraph-0.3.1/trustgraph/embeddings/ollama/__init__.py +3 -0
- trustgraph-0.3.1/trustgraph/embeddings/ollama/processor.py +86 -0
- trustgraph-0.3.1/trustgraph/embeddings/vectorize/vectorize.py +77 -0
- trustgraph-0.3.1/trustgraph/graph/cassandra_write/write.py +75 -0
- trustgraph-0.3.1/trustgraph/kg/extract_definitions/extract.py +108 -0
- trustgraph-0.3.1/trustgraph/kg/extract_relationships/extract.py +167 -0
- trustgraph-0.3.1/trustgraph/llm/azure_text/llm.py +126 -0
- trustgraph-0.3.1/trustgraph/llm/claude_text/llm.py +108 -0
- trustgraph-0.3.1/trustgraph/llm/ollama_text/llm.py +88 -0
- trustgraph-0.3.1/trustgraph/llm/vertexai_text/llm.py +176 -0
- trustgraph-0.3.1/trustgraph/rag/graph/rag.py +117 -0
- trustgraph-0.3.1/trustgraph/vector/milvus_write/write.py +60 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph.egg-info/PKG-INFO +2 -2
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph.egg-info/SOURCES.txt +2 -0
- trustgraph-0.2.4/trustgraph/chunker/recursive/chunker.py +0 -191
- trustgraph-0.2.4/trustgraph/decoder/pdf/pdf_decoder.py +0 -174
- trustgraph-0.2.4/trustgraph/embeddings/hf/hf.py +0 -165
- trustgraph-0.2.4/trustgraph/embeddings/ollama/processor.py +0 -175
- trustgraph-0.2.4/trustgraph/embeddings/vectorize/vectorize.py +0 -163
- trustgraph-0.2.4/trustgraph/graph/cassandra_write/write.py +0 -148
- trustgraph-0.2.4/trustgraph/kg/extract_definitions/extract.py +0 -197
- trustgraph-0.2.4/trustgraph/kg/extract_relationships/extract.py +0 -253
- trustgraph-0.2.4/trustgraph/llm/azure_text/llm.py +0 -213
- trustgraph-0.2.4/trustgraph/llm/claude_text/llm.py +0 -192
- trustgraph-0.2.4/trustgraph/llm/ollama_text/llm.py +0 -174
- trustgraph-0.2.4/trustgraph/llm/vertexai_text/llm.py +0 -258
- trustgraph-0.2.4/trustgraph/rag/graph/rag.py +0 -207
- trustgraph-0.2.4/trustgraph/vector/milvus_write/write.py +0 -140
- {trustgraph-0.2.4 → trustgraph-0.3.1}/LICENSE +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/README.md +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/chunker-recursive +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/embeddings-hf +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/embeddings-ollama +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/embeddings-vectorize +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/graph-rag +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/graph-show +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/graph-to-turtle +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/graph-write-cassandra +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/init-pulsar-manager +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/kg-extract-definitions +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/kg-extract-relationships +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/llm-azure-text +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/llm-claude-text +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/llm-ollama-text +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/llm-vertexai-text +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/loader +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/pdf-decoder +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/query +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/run-processing +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/scripts/vector-write-milvus +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/setup.cfg +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/__init__.py +0 -0
- {trustgraph-0.2.4/trustgraph/embeddings/ollama → trustgraph-0.3.1/trustgraph/base}/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/chunker/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/chunker/recursive/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/chunker/recursive/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/decoder/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/decoder/pdf/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/decoder/pdf/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/embeddings/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/embeddings/hf/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/embeddings/hf/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/embeddings/ollama/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/embeddings/vectorize/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/embeddings/vectorize/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/embeddings_client.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/graph/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/graph/cassandra_write/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/graph/cassandra_write/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/graph_rag.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/graph_rag_client.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/kg/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/kg/extract_definitions/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/kg/extract_definitions/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/kg/extract_relationships/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/kg/extract_relationships/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm/azure_text/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm/azure_text/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm/claude_text/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm/claude_text/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm/ollama_text/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm/ollama_text/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm/vertexai_text/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm/vertexai_text/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/llm_client.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/log_level.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/processing/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/processing/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/processing/processing.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/prompts.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/rag/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/rag/graph/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/rag/graph/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/rdf.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/schema.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/triple_vectors.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/trustgraph.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/vector/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/vector/milvus_write/__init__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph/vector/milvus_write/__main__.py +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph.egg-info/dependency_links.txt +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph.egg-info/requires.txt +0 -0
- {trustgraph-0.2.4 → trustgraph-0.3.1}/trustgraph.egg-info/top_level.txt +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: trustgraph
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
|
5
5
|
Home-page: https://github.com/trustgraph-ai/trustgraph
|
|
6
|
-
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.2.4.tar.gz
|
|
6
|
+
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.3.1.tar.gz
|
|
7
7
|
Author: trustgraph.ai
|
|
8
8
|
Author-email: security@trustgraph.ai
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
|
|
2
|
+
import os
|
|
3
|
+
import argparse
|
|
4
|
+
import pulsar
|
|
5
|
+
import time
|
|
6
|
+
from pulsar.schema import JsonSchema
|
|
7
|
+
|
|
8
|
+
from .. log_level import LogLevel
|
|
9
|
+
|
|
10
|
+
class BaseProcessor:
    """
    Base class for Pulsar-connected processors.

    Owns the Pulsar client connection, registers the common command-line
    arguments, and provides the retry-forever start() entry point used by
    the per-service scripts.
    """

    default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')

    def __init__(
            self,
            pulsar_host=default_pulsar_host,
            log_level=LogLevel.INFO,
    ):
        # Set before connecting so __del__ is safe even if Client() raises.
        self.client = None

        if pulsar_host is None:
            # Bug fix: the original referenced the bare name
            # `default_pulsar_host`, but class attributes are not in scope
            # inside a method body, so this path raised NameError whenever
            # a subclass was constructed with its pulsar_host=None default.
            pulsar_host = self.default_pulsar_host

        self.pulsar_host = pulsar_host

        self.client = pulsar.Client(
            pulsar_host,
            logger=pulsar.ConsoleLogger(log_level.to_pulsar())
        )

    def __del__(self):

        if self.client:
            self.client.close()

    @staticmethod
    def add_args(parser):
        """Register arguments common to all processors."""

        parser.add_argument(
            '-p', '--pulsar-host',
            default=__class__.default_pulsar_host,
            help=f'Pulsar host (default: {__class__.default_pulsar_host})',
        )

        parser.add_argument(
            '-l', '--log-level',
            type=LogLevel,
            default=LogLevel.INFO,
            choices=list(LogLevel),
            # Fix: the original help text wrongly said "Output queue".
            help='Log level (default: info)'
        )

    def run(self):
        raise RuntimeError("Something should have implemented the run method")

    @classmethod
    def start(cls, prog, doc):
        """
        Parse command-line arguments, construct the processor and run it,
        retrying forever with a 10-second back-off on any exception.
        """

        while True:

            parser = argparse.ArgumentParser(
                prog=prog,
                description=doc
            )

            cls.add_args(parser)

            args = vars(parser.parse_args())

            try:

                p = cls(**args)
                p.run()

            except Exception as e:

                print("Exception:", e, flush=True)
                print("Will retry...", flush=True)

            time.sleep(10)
class Consumer(BaseProcessor):
    """
    Processor which consumes messages from a single input queue and
    dispatches each one to the subclass's handle() method.
    """

    def __init__(
            self,
            pulsar_host=None,
            log_level=LogLevel.INFO,
            input_queue="input",
            subscriber="subscriber",
            input_schema=None,
    ):

        super(Consumer, self).__init__(
            pulsar_host=pulsar_host,
            log_level=log_level,
        )

        if input_schema is None:
            raise RuntimeError("input_schema must be specified")

        self.consumer = self.client.subscribe(
            input_queue, subscriber,
            schema=JsonSchema(input_schema),
        )

    def run(self):
        """Receive loop: ack on successful handle(), nack on failure."""

        while True:

            msg = self.consumer.receive()

            try:

                self.handle(msg)

                # Acknowledge successful processing of the message
                self.consumer.acknowledge(msg)

            except Exception as e:

                print("Exception:", e, flush=True)

                # Message failed to be processed; redeliver later
                self.consumer.negative_acknowledge(msg)

    @staticmethod
    def add_args(parser, default_input_queue, default_subscriber):
        """Register common arguments plus input queue and subscriber name."""

        BaseProcessor.add_args(parser)

        parser.add_argument(
            '-i', '--input-queue',
            default=default_input_queue,
            help=f'Input queue (default: {default_input_queue})'
        )

        parser.add_argument(
            '-s', '--subscriber',
            default=default_subscriber,
            help=f'Queue subscriber name (default: {default_subscriber})'
        )
class ConsumerProducer(BaseProcessor):
    """
    Processor which consumes messages from an input queue and publishes
    results to an output queue.  Subclasses implement handle() and call
    send() to emit output messages.
    """

    def __init__(
            self,
            pulsar_host=None,
            log_level=LogLevel.INFO,
            input_queue="input",
            output_queue="output",
            subscriber="subscriber",
            input_schema=None,
            output_schema=None,
    ):

        super(ConsumerProducer, self).__init__(
            pulsar_host=pulsar_host,
            log_level=log_level,
        )

        if input_schema is None:
            raise RuntimeError("input_schema must be specified")

        if output_schema is None:
            raise RuntimeError("output_schema must be specified")

        self.consumer = self.client.subscribe(
            input_queue, subscriber,
            schema=JsonSchema(input_schema),
        )

        self.producer = self.client.create_producer(
            topic=output_queue,
            schema=JsonSchema(output_schema),
        )

    def run(self):
        """Receive loop: ack on successful handle(), nack on failure."""

        while True:

            msg = self.consumer.receive()

            try:

                # handle()'s return value was previously bound to an unused
                # variable; output is emitted via send() by the subclass.
                self.handle(msg)

                # Acknowledge successful processing of the message
                self.consumer.acknowledge(msg)

            except Exception as e:

                print("Exception:", e, flush=True)

                # Message failed to be processed; redeliver later
                self.consumer.negative_acknowledge(msg)

    def send(self, msg, properties=None):
        """Publish a message, with optional Pulsar message properties."""

        # Fix: avoid a shared mutable default dict.
        self.producer.send(msg, properties if properties is not None else {})

    @staticmethod
    def add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
    ):
        """Register common arguments plus input/output queues and subscriber."""

        BaseProcessor.add_args(parser)

        parser.add_argument(
            '-i', '--input-queue',
            default=default_input_queue,
            help=f'Input queue (default: {default_input_queue})'
        )

        parser.add_argument(
            '-s', '--subscriber',
            default=default_subscriber,
            help=f'Queue subscriber name (default: {default_subscriber})'
        )

        parser.add_argument(
            '-o', '--output-queue',
            default=default_output_queue,
            help=f'Output queue (default: {default_output_queue})'
        )
class Producer(BaseProcessor):
    """
    Processor which only publishes messages to an output queue.
    """

    def __init__(
            self,
            pulsar_host=None,
            log_level=LogLevel.INFO,
            output_queue="output",
            output_schema=None,
    ):

        super(Producer, self).__init__(
            pulsar_host=pulsar_host,
            log_level=log_level,
        )

        if output_schema is None:
            raise RuntimeError("output_schema must be specified")

        self.producer = self.client.create_producer(
            topic=output_queue,
            schema=JsonSchema(output_schema),
        )

    def send(self, msg, properties=None):
        """Publish a message, with optional Pulsar message properties."""

        # Fix: avoid a shared mutable default dict.
        self.producer.send(msg, properties if properties is not None else {})

    @staticmethod
    def add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
    ):
        # NOTE(review): default_input_queue / default_subscriber are unused
        # here but retained for signature compatibility with the sibling
        # processor classes' add_args.

        BaseProcessor.add_args(parser)

        parser.add_argument(
            '-o', '--output-queue',
            default=default_output_queue,
            help=f'Output queue (default: {default_output_queue})'
        )
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Simple decoder, accepts text documents on input, outputs chunks from the
|
|
4
|
+
as text as separate output objects.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
from ... schema import TextDocument, Chunk, Source
|
|
11
|
+
from ... log_level import LogLevel
|
|
12
|
+
from ... base import ConsumerProducer
|
|
13
|
+
|
|
14
|
+
default_input_queue = 'text-doc-load'
|
|
15
|
+
default_output_queue = 'chunk-load'
|
|
16
|
+
default_subscriber = 'chunker-recursive'
|
|
17
|
+
|
|
18
|
+
class Processor(ConsumerProducer):
    """
    Splits incoming TextDocument messages into overlapping chunks with
    LangChain's recursive character splitter, emitting one Chunk message
    per piece.
    """

    def __init__(
            self,
            pulsar_host=None,
            input_queue=default_input_queue,
            output_queue=default_output_queue,
            subscriber=default_subscriber,
            log_level=LogLevel.INFO,
            chunk_size=2000,
            chunk_overlap=100,
    ):

        super(Processor, self).__init__(
            pulsar_host=pulsar_host,
            log_level=log_level,
            input_queue=input_queue,
            output_queue=output_queue,
            subscriber=subscriber,
            input_schema=TextDocument,
            output_schema=Chunk,
        )

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    def handle(self, msg):
        """Chunk one text document and emit a Chunk per piece."""

        v = msg.value()
        print(f"Chunking {v.source.id}...", flush=True)

        # Payload travels as bytes; the splitter wants str.
        texts = self.text_splitter.create_documents(
            [v.text.decode("utf-8")]
        )

        for ix, chunk in enumerate(texts):

            # Renamed from `id` to avoid shadowing the builtin.
            chunk_id = v.source.id + "-c" + str(ix)

            r = Chunk(
                source=Source(
                    source=v.source.source,
                    id=chunk_id,
                    title=v.source.title
                ),
                chunk=chunk.page_content.encode("utf-8"),
            )

            self.send(r)

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """Register queue arguments plus chunk size/overlap."""

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '-z', '--chunk-size',
            type=int,
            default=2000,
            help='Chunk size (default: 2000)'
        )

        parser.add_argument(
            '-v', '--chunk-overlap',
            type=int,
            default=100,
            help='Chunk overlap (default: 100)'
        )
def run():
    """Command-line entry point for the recursive chunker service."""
    Processor.start('chunker', __doc__)
@@ -0,0 +1,87 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Simple decoder, accepts PDF documents on input, outputs pages from the
|
|
4
|
+
PDF document as text as separate output objects.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import tempfile
|
|
8
|
+
import base64
|
|
9
|
+
from langchain_community.document_loaders import PyPDFLoader
|
|
10
|
+
|
|
11
|
+
from ... schema import Document, TextDocument, Source
|
|
12
|
+
from ... log_level import LogLevel
|
|
13
|
+
from ... base import ConsumerProducer
|
|
14
|
+
|
|
15
|
+
default_input_queue = 'document-load'
|
|
16
|
+
default_output_queue = 'text-doc-load'
|
|
17
|
+
default_subscriber = 'pdf-decoder'
|
|
18
|
+
|
|
19
|
+
class Processor(ConsumerProducer):
    """
    Decodes base64-encoded PDF documents and emits one TextDocument
    message per page.
    """

    def __init__(
            self,
            pulsar_host=None,
            input_queue=default_input_queue,
            output_queue=default_output_queue,
            subscriber=default_subscriber,
            log_level=LogLevel.INFO,
    ):

        super(Processor, self).__init__(
            pulsar_host=pulsar_host,
            log_level=log_level,
            input_queue=input_queue,
            output_queue=output_queue,
            subscriber=subscriber,
            input_schema=Document,
            output_schema=TextDocument,
        )

        print("PDF inited")

    def handle(self, msg):
        """Decode one base64 PDF payload and emit a TextDocument per page."""

        print("PDF message received")

        v = msg.value()

        print(f"Decoding {v.source.id}...", flush=True)

        # NOTE: delete_on_close requires Python >= 3.12; the file survives
        # fp.close() and is removed when the `with` block exits.
        with tempfile.NamedTemporaryFile(delete_on_close=False) as fp:

            fp.write(base64.b64decode(v.data))
            fp.close()

            # PyPDFLoader opens the file by name itself; the redundant
            # unused open() from the original has been removed.
            loader = PyPDFLoader(fp.name)
            pages = loader.load()

            for ix, page in enumerate(pages):

                # Renamed from `id` to avoid shadowing the builtin.
                page_id = v.source.id + "-p" + str(ix)
                r = TextDocument(
                    source=Source(
                        source=v.source.source,
                        title=v.source.title,
                        id=page_id,
                    ),
                    text=page.page_content.encode("utf-8"),
                )

                self.send(r)

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """Register the standard queue arguments."""

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )
def run():
    """Command-line entry point for the PDF decoder service."""
    Processor.start("pdf-decoder", __doc__)
@@ -0,0 +1,77 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Embeddings service, applies an embeddings model selected from HuggingFace.
|
|
4
|
+
Input is text, output is embeddings vector.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from langchain_huggingface import HuggingFaceEmbeddings
|
|
8
|
+
|
|
9
|
+
from ... schema import EmbeddingsRequest, EmbeddingsResponse
|
|
10
|
+
from ... log_level import LogLevel
|
|
11
|
+
from ... base import ConsumerProducer
|
|
12
|
+
|
|
13
|
+
default_input_queue = 'embeddings'
|
|
14
|
+
default_output_queue = 'embeddings-response'
|
|
15
|
+
default_subscriber = 'embeddings-hf'
|
|
16
|
+
default_model="all-MiniLM-L6-v2"
|
|
17
|
+
|
|
18
|
+
class Processor(ConsumerProducer):
    """
    Embeddings service backed by a HuggingFace model: takes
    EmbeddingsRequest messages and returns EmbeddingsResponse vectors.
    """

    def __init__(
            self,
            pulsar_host=None,
            input_queue=default_input_queue,
            output_queue=default_output_queue,
            subscriber=default_subscriber,
            log_level=LogLevel.INFO,
            model=default_model,
    ):

        super(Processor, self).__init__(
            pulsar_host=pulsar_host,
            log_level=log_level,
            input_queue=input_queue,
            output_queue=output_queue,
            subscriber=subscriber,
            input_schema=EmbeddingsRequest,
            output_schema=EmbeddingsResponse,
        )

        self.embeddings = HuggingFaceEmbeddings(model_name=model)

    def handle(self, msg):
        """Embed the request text and reply, correlating by the `id` property."""

        v = msg.value()

        # Sender-produced ID, echoed back so the caller can match the reply
        id = msg.properties()["id"]

        print(f"Handling input {id}...", flush=True)

        text = v.text
        embeds = self.embeddings.embed_documents([text])

        print("Send response...", flush=True)
        r = EmbeddingsResponse(vectors=embeds)
        self.producer.send(r, properties={"id": id})

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """Register queue arguments plus the embeddings model selector."""

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '-m', '--model',
            # Fix: use the module default instead of a duplicated literal,
            # and describe it as an embeddings model, not an LLM.
            default=default_model,
            help=f'Embeddings model (default: {default_model})'
        )
def run():
    """Command-line entry point for the HuggingFace embeddings service."""
    Processor.start("embeddings-hf", __doc__)
@@ -0,0 +1,86 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Embeddings service, applies an embeddings model selected from HuggingFace.
|
|
4
|
+
Input is text, output is embeddings vector.
|
|
5
|
+
"""
|
|
6
|
+
from langchain_community.embeddings import OllamaEmbeddings
|
|
7
|
+
|
|
8
|
+
from ... schema import EmbeddingsRequest, EmbeddingsResponse
|
|
9
|
+
from ... log_level import LogLevel
|
|
10
|
+
from ... base import ConsumerProducer
|
|
11
|
+
|
|
12
|
+
default_input_queue = 'embeddings'
|
|
13
|
+
default_output_queue = 'embeddings-response'
|
|
14
|
+
default_subscriber = 'embeddings-ollama'
|
|
15
|
+
default_model="mxbai-embed-large"
|
|
16
|
+
default_ollama = 'http://localhost:11434'
|
|
17
|
+
|
|
18
|
+
class Processor(ConsumerProducer):
    """
    Embeddings service backed by an Ollama server: takes
    EmbeddingsRequest messages and returns EmbeddingsResponse vectors.
    """

    def __init__(
            self,
            pulsar_host=None,
            input_queue=default_input_queue,
            output_queue=default_output_queue,
            subscriber=default_subscriber,
            log_level=LogLevel.INFO,
            model=default_model,
            ollama=default_ollama,
    ):

        super(Processor, self).__init__(
            pulsar_host=pulsar_host,
            log_level=log_level,
            input_queue=input_queue,
            output_queue=output_queue,
            subscriber=subscriber,
            input_schema=EmbeddingsRequest,
            output_schema=EmbeddingsResponse,
        )

        self.embeddings = OllamaEmbeddings(base_url=ollama, model=model)

    def handle(self, msg):
        """Embed the request text and reply, correlating by the `id` property."""

        v = msg.value()

        # Sender-produced ID, echoed back so the caller can match the reply
        id = msg.properties()["id"]

        print(f"Handling input {id}...", flush=True)

        text = v.text

        # Bug fix: embed_query() takes a single string; the original passed
        # a one-element list, which the model sees stringified.
        embeds = self.embeddings.embed_query(text)

        print("Send response...", flush=True)
        r = EmbeddingsResponse(vectors=[embeds])

        self.producer.send(r, properties={"id": id})

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """Register queue arguments plus model and Ollama server selectors."""

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '-m', '--model',
            default=default_model,
            help=f'Embeddings model (default: {default_model})'
        )

        parser.add_argument(
            '-r', '--ollama',
            default=default_ollama,
            help=f'ollama (default: {default_ollama})'
        )
def run():
    """Command-line entry point for the Ollama embeddings service."""
    Processor.start('embeddings-ollama', __doc__)