trustgraph 0.3.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of trustgraph might be problematic. Click here for more details.

Files changed (97)
  1. {trustgraph-0.3.0 → trustgraph-0.4.1}/PKG-INFO +3 -2
  2. {trustgraph-0.3.0 → trustgraph-0.4.1}/setup.py +2 -1
  3. trustgraph-0.4.1/trustgraph/base/processor.py +360 -0
  4. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/chunker/recursive/chunker.py +15 -18
  5. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/decoder/pdf/pdf_decoder.py +12 -15
  6. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/hf/hf.py +13 -16
  7. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/ollama/processor.py +12 -17
  8. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/vectorize/vectorize.py +13 -16
  9. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph/cassandra_write/write.py +10 -20
  10. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph_rag.py +3 -3
  11. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_definitions/extract.py +12 -15
  12. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_relationships/extract.py +25 -17
  13. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/azure_text/llm.py +15 -17
  14. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/claude_text/llm.py +17 -19
  15. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/ollama_text/llm.py +27 -17
  16. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/vertexai_text/llm.py +15 -18
  17. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rag/graph/rag.py +24 -25
  18. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/vector/milvus_write/write.py +12 -13
  19. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/PKG-INFO +3 -2
  20. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/requires.txt +1 -0
  21. trustgraph-0.3.0/trustgraph/base/processor.py +0 -266
  22. {trustgraph-0.3.0 → trustgraph-0.4.1}/LICENSE +0 -0
  23. {trustgraph-0.3.0 → trustgraph-0.4.1}/README.md +0 -0
  24. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/chunker-recursive +0 -0
  25. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/embeddings-hf +0 -0
  26. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/embeddings-ollama +0 -0
  27. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/embeddings-vectorize +0 -0
  28. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/graph-rag +0 -0
  29. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/graph-show +0 -0
  30. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/graph-to-turtle +0 -0
  31. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/graph-write-cassandra +0 -0
  32. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/init-pulsar-manager +0 -0
  33. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/kg-extract-definitions +0 -0
  34. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/kg-extract-relationships +0 -0
  35. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/llm-azure-text +0 -0
  36. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/llm-claude-text +0 -0
  37. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/llm-ollama-text +0 -0
  38. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/llm-vertexai-text +0 -0
  39. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/loader +0 -0
  40. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/pdf-decoder +0 -0
  41. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/query +0 -0
  42. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/run-processing +0 -0
  43. {trustgraph-0.3.0 → trustgraph-0.4.1}/scripts/vector-write-milvus +0 -0
  44. {trustgraph-0.3.0 → trustgraph-0.4.1}/setup.cfg +0 -0
  45. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/__init__.py +0 -0
  46. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/base/__init__.py +0 -0
  47. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/chunker/__init__.py +0 -0
  48. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/chunker/recursive/__init__.py +0 -0
  49. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/chunker/recursive/__main__.py +0 -0
  50. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/decoder/__init__.py +0 -0
  51. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/decoder/pdf/__init__.py +0 -0
  52. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/decoder/pdf/__main__.py +0 -0
  53. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/__init__.py +0 -0
  54. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/hf/__init__.py +0 -0
  55. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/hf/__main__.py +0 -0
  56. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/ollama/__init__.py +0 -0
  57. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/ollama/__main__.py +0 -0
  58. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/vectorize/__init__.py +0 -0
  59. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings/vectorize/__main__.py +0 -0
  60. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/embeddings_client.py +0 -0
  61. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph/__init__.py +0 -0
  62. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph/cassandra_write/__init__.py +0 -0
  63. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph/cassandra_write/__main__.py +0 -0
  64. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/graph_rag_client.py +0 -0
  65. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/__init__.py +0 -0
  66. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_definitions/__init__.py +0 -0
  67. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_definitions/__main__.py +0 -0
  68. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_relationships/__init__.py +0 -0
  69. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/kg/extract_relationships/__main__.py +0 -0
  70. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/__init__.py +0 -0
  71. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/azure_text/__init__.py +0 -0
  72. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/azure_text/__main__.py +0 -0
  73. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/claude_text/__init__.py +0 -0
  74. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/claude_text/__main__.py +0 -0
  75. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/ollama_text/__init__.py +0 -0
  76. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/ollama_text/__main__.py +0 -0
  77. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/vertexai_text/__init__.py +0 -0
  78. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm/vertexai_text/__main__.py +0 -0
  79. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/llm_client.py +0 -0
  80. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/log_level.py +0 -0
  81. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/processing/__init__.py +0 -0
  82. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/processing/__main__.py +0 -0
  83. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/processing/processing.py +0 -0
  84. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/prompts.py +0 -0
  85. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rag/__init__.py +0 -0
  86. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rag/graph/__init__.py +0 -0
  87. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rag/graph/__main__.py +0 -0
  88. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/rdf.py +0 -0
  89. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/schema.py +0 -0
  90. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/triple_vectors.py +0 -0
  91. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/trustgraph.py +0 -0
  92. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/vector/__init__.py +0 -0
  93. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/vector/milvus_write/__init__.py +0 -0
  94. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph/vector/milvus_write/__main__.py +0 -0
  95. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/SOURCES.txt +0 -0
  96. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/dependency_links.txt +0 -0
  97. {trustgraph-0.3.0 → trustgraph-0.4.1}/trustgraph.egg-info/top_level.txt +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: trustgraph
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
5
  Home-page: https://github.com/trustgraph-ai/trustgraph
6
- Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.3.0.tar.gz
6
+ Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.4.1.tar.gz
7
7
  Author: trustgraph.ai
8
8
  Author-email: security@trustgraph.ai
9
9
  Classifier: Programming Language :: Python :: 3
@@ -31,6 +31,7 @@ Requires-Dist: pypdf
31
31
  Requires-Dist: anthropic
32
32
  Requires-Dist: google-cloud-aiplatform
33
33
  Requires-Dist: pyyaml
34
+ Requires-Dist: prometheus-client
34
35
 
35
36
 
36
37
  # TrustGraph
@@ -4,7 +4,7 @@ import os
4
4
  with open("README.md", "r") as fh:
5
5
  long_description = fh.read()
6
6
 
7
- version = "0.3.0"
7
+ version = "0.4.1"
8
8
 
9
9
  setuptools.setup(
10
10
  name="trustgraph",
@@ -43,6 +43,7 @@ setuptools.setup(
43
43
  "anthropic",
44
44
  "google-cloud-aiplatform",
45
45
  "pyyaml",
46
+ "prometheus-client",
46
47
  ],
47
48
  scripts=[
48
49
  "scripts/chunker-recursive",
@@ -0,0 +1,360 @@
1
+
2
+ import os
3
+ import argparse
4
+ import pulsar
5
+ import _pulsar
6
+ import time
7
+ from pulsar.schema import JsonSchema
8
+ from prometheus_client import start_http_server, Histogram, Info, Counter
9
+
10
+ from .. log_level import LogLevel
11
+
12
class BaseProcessor:
    """
    Base class for pipeline processors: sets up the Pulsar client,
    registers Prometheus metrics, declares the shared CLI arguments,
    and provides the run-forever-with-retry entry point.
    """

    # Pulsar service URL used when no --pulsar-host is supplied.
    default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')

    def __init__(self, **params):
        """
        params:
            pulsar_host: Pulsar service URL (default: PULSAR_HOST env var)
            log_level:   LogLevel for the Pulsar client logger
        """

        self.client = None

        # Prometheus metrics may only be registered once per process;
        # guard on the class so repeated construction doesn't raise.
        if not hasattr(__class__, "params_metric"):
            __class__.params_metric = Info(
                'params', 'Parameters configuration'
            )

        # FIXME: Maybe outputs information it should not
        __class__.params_metric.info({
            k: str(params[k])
            for k in params
        })

        pulsar_host = params.get("pulsar_host", self.default_pulsar_host)
        log_level = params.get("log_level", LogLevel.INFO)

        self.pulsar_host = pulsar_host

        self.client = pulsar.Client(
            pulsar_host,
            logger=pulsar.ConsoleLogger(log_level.to_pulsar())
        )

    def __del__(self):

        # Best-effort close; client may be None if __init__ failed early.
        if self.client:
            self.client.close()

    @staticmethod
    def _parse_bool(value):
        """Parse a CLI boolean: 'false', 'no', '0', 'off' and '' are False."""
        if isinstance(value, bool):
            return value
        return str(value).strip().lower() not in ("false", "no", "0", "off", "")

    @staticmethod
    def add_args(parser):
        """Register the CLI arguments shared by all processors."""

        parser.add_argument(
            '-p', '--pulsar-host',
            default=__class__.default_pulsar_host,
            help=f'Pulsar host (default: {__class__.default_pulsar_host})',
        )

        parser.add_argument(
            '-l', '--log-level',
            type=LogLevel,
            default=LogLevel.INFO,
            choices=list(LogLevel),
            # Fixed: help text previously said "Output queue".
            help='Log level (default: info)'
        )

        parser.add_argument(
            '-M', '--metrics-enabled',
            # Fixed: type=bool treats ANY non-empty string (even "false")
            # as True; use an explicit parser so "--metrics-enabled false"
            # actually disables metrics.
            type=__class__._parse_bool,
            default=True,
            help='Metrics enabled (default: true)',
        )

        parser.add_argument(
            '-P', '--metrics-port',
            type=int,
            default=8000,
            help='Metrics port (default: 8000)',
        )

    def run(self):
        # Subclasses must override this.
        raise RuntimeError("Something should have implemented the run method")

    @classmethod
    def start(cls, prog, doc):
        """
        CLI entry point: parse arguments, optionally expose Prometheus
        metrics, then construct and run the processor, retrying forever
        on unexpected failure.
        """

        # Parse arguments and start the metrics server ONCE, outside the
        # retry loop: the arguments never change, and a second call to
        # start_http_server would fail with "address already in use".
        parser = argparse.ArgumentParser(
            prog=prog,
            description=doc
        )

        cls.add_args(parser)

        args = vars(parser.parse_args())

        if args["metrics_enabled"]:
            start_http_server(args["metrics_port"])

        while True:

            try:

                p = cls(**args)
                p.run()

            except KeyboardInterrupt:
                print("Keyboard interrupt.")
                return

            except _pulsar.Interrupted:
                print("Pulsar Interrupted.")
                return

            except Exception as e:

                print(type(e))

                print("Exception:", e, flush=True)
                print("Will retry...", flush=True)

                time.sleep(10)
class Consumer(BaseProcessor):
    """
    Processor that consumes messages from a single input queue.
    Subclasses implement handle(msg).
    """

    def __init__(self, **params):
        """
        params (in addition to BaseProcessor's):
            input_queue:  topic to subscribe to
            subscriber:   subscription name
            input_schema: pulsar record class for incoming messages
        """

        super(Consumer, self).__init__(**params)

        input_queue = params.get("input_queue")
        subscriber = params.get("subscriber")
        input_schema = params.get("input_schema")

        # Validate before any use so a missing schema fails clearly.
        if input_schema is None:
            raise RuntimeError("input_schema must be specified")

        # Metrics are registered once per process (see BaseProcessor).
        if not hasattr(__class__, "request_metric"):
            __class__.request_metric = Histogram(
                'request_latency', 'Request latency (seconds)'
            )

        if not hasattr(__class__, "pubsub_metric"):
            __class__.pubsub_metric = Info(
                'pubsub', 'Pub/sub configuration'
            )

        if not hasattr(__class__, "processing_metric"):
            __class__.processing_metric = Counter(
                'processing_count', 'Processing count', ["status"]
            )

        __class__.pubsub_metric.info({
            "input_queue": input_queue,
            "subscriber": subscriber,
            "input_schema": input_schema.__name__,
        })

        self.consumer = self.client.subscribe(
            input_queue, subscriber,
            schema=JsonSchema(input_schema),
        )

    def run(self):
        """Receive messages forever; ack on success, nack on failure."""

        while True:

            msg = self.consumer.receive()

            try:

                with __class__.request_metric.time():
                    self.handle(msg)

                # Acknowledge successful processing of the message
                self.consumer.acknowledge(msg)

                __class__.processing_metric.labels(status="success").inc()

            except Exception as e:

                print("Exception:", e, flush=True)

                # Message failed to be processed; nack so it is redelivered
                self.consumer.negative_acknowledge(msg)

                __class__.processing_metric.labels(status="error").inc()

    @staticmethod
    def add_args(parser, default_input_queue, default_subscriber):
        """Register consumer CLI arguments on top of the base set."""

        BaseProcessor.add_args(parser)

        parser.add_argument(
            '-i', '--input-queue',
            default=default_input_queue,
            help=f'Input queue (default: {default_input_queue})'
        )

        parser.add_argument(
            '-s', '--subscriber',
            default=default_subscriber,
            help=f'Queue subscriber name (default: {default_subscriber})'
        )
class ConsumerProducer(BaseProcessor):
    """
    Processor that consumes from an input queue and publishes results to
    an output queue.  Subclasses implement handle(msg) and call send().
    """

    def __init__(self, **params):
        """
        params (in addition to BaseProcessor's):
            input_queue:   topic to subscribe to
            output_queue:  topic to publish to
            subscriber:    subscription name
            input_schema:  pulsar record class for incoming messages
            output_schema: pulsar record class for outgoing messages
        """

        input_queue = params.get("input_queue")
        output_queue = params.get("output_queue")
        subscriber = params.get("subscriber")
        input_schema = params.get("input_schema")
        output_schema = params.get("output_schema")

        # Fixed: validate the schemas BEFORE touching .__name__ below —
        # previously a missing schema raised an opaque AttributeError
        # instead of this intended RuntimeError.
        if input_schema is None:
            raise RuntimeError("input_schema must be specified")

        if output_schema is None:
            raise RuntimeError("output_schema must be specified")

        # Metrics are registered once per process (see BaseProcessor).
        if not hasattr(__class__, "request_metric"):
            __class__.request_metric = Histogram(
                'request_latency', 'Request latency (seconds)'
            )

        if not hasattr(__class__, "output_metric"):
            __class__.output_metric = Counter(
                'output_count', 'Output items created'
            )

        if not hasattr(__class__, "pubsub_metric"):
            __class__.pubsub_metric = Info(
                'pubsub', 'Pub/sub configuration'
            )

        if not hasattr(__class__, "processing_metric"):
            __class__.processing_metric = Counter(
                'processing_count', 'Processing count', ["status"]
            )

        __class__.pubsub_metric.info({
            "input_queue": input_queue,
            "output_queue": output_queue,
            "subscriber": subscriber,
            "input_schema": input_schema.__name__,
            "output_schema": output_schema.__name__,
        })

        super(ConsumerProducer, self).__init__(**params)

        self.consumer = self.client.subscribe(
            input_queue, subscriber,
            schema=JsonSchema(input_schema),
        )

        self.producer = self.client.create_producer(
            topic=output_queue,
            schema=JsonSchema(output_schema),
        )

    def run(self):
        """Receive messages forever; ack on success, nack on failure."""

        while True:

            msg = self.consumer.receive()

            try:

                with __class__.request_metric.time():
                    self.handle(msg)

                # Acknowledge successful processing of the message
                self.consumer.acknowledge(msg)

                __class__.processing_metric.labels(status="success").inc()

            except Exception as e:

                print("Exception:", e, flush=True)

                # Message failed to be processed; nack so it is redelivered
                self.consumer.negative_acknowledge(msg)

                __class__.processing_metric.labels(status="error").inc()

    def send(self, msg, properties=None):
        """Publish msg to the output queue and count it."""
        # Fixed: None default instead of a shared mutable {} default.
        self.producer.send(msg, properties if properties is not None else {})
        __class__.output_metric.inc()

    @staticmethod
    def add_args(
        parser, default_input_queue, default_subscriber,
        default_output_queue,
    ):
        """Register consumer/producer CLI arguments on top of the base set."""

        BaseProcessor.add_args(parser)

        parser.add_argument(
            '-i', '--input-queue',
            default=default_input_queue,
            help=f'Input queue (default: {default_input_queue})'
        )

        parser.add_argument(
            '-s', '--subscriber',
            default=default_subscriber,
            help=f'Queue subscriber name (default: {default_subscriber})'
        )

        parser.add_argument(
            '-o', '--output-queue',
            default=default_output_queue,
            help=f'Output queue (default: {default_output_queue})'
        )
class Producer(BaseProcessor):
    """
    Processor that only publishes messages to an output queue.
    """

    def __init__(self, **params):
        """
        params (in addition to BaseProcessor's):
            output_queue:  topic to publish to
            output_schema: pulsar record class for outgoing messages
        """

        output_queue = params.get("output_queue")
        output_schema = params.get("output_schema")

        # Fixed: validate BEFORE touching .__name__ below — previously a
        # missing schema raised an opaque AttributeError instead of this
        # intended RuntimeError.
        if output_schema is None:
            raise RuntimeError("output_schema must be specified")

        # Metrics are registered once per process (see BaseProcessor).
        if not hasattr(__class__, "output_metric"):
            __class__.output_metric = Counter(
                'output_count', 'Output items created'
            )

        if not hasattr(__class__, "pubsub_metric"):
            __class__.pubsub_metric = Info(
                'pubsub', 'Pub/sub configuration'
            )

        __class__.pubsub_metric.info({
            "output_queue": output_queue,
            "output_schema": output_schema.__name__,
        })

        super(Producer, self).__init__(**params)

        self.producer = self.client.create_producer(
            topic=output_queue,
            schema=JsonSchema(output_schema),
        )

    def send(self, msg, properties=None):
        """Publish msg to the output queue and count it."""
        # Fixed: None default instead of a shared mutable {} default.
        self.producer.send(msg, properties if properties is not None else {})
        __class__.output_metric.inc()

    @staticmethod
    def add_args(
        parser, default_input_queue, default_subscriber,
        default_output_queue,
    ):
        """
        Register producer CLI arguments on top of the base set.
        The input-queue/subscriber parameters are unused but kept for
        signature compatibility with existing callers.
        """

        BaseProcessor.add_args(parser)

        parser.add_argument(
            '-o', '--output-queue',
            default=default_output_queue,
            help=f'Output queue (default: {default_output_queue})'
        )
@@ -17,25 +17,22 @@ default_subscriber = 'chunker-recursive'
17
17
 
18
18
  class Processor(ConsumerProducer):
19
19
 
20
- def __init__(
21
- self,
22
- pulsar_host=None,
23
- input_queue=default_input_queue,
24
- output_queue=default_output_queue,
25
- subscriber=default_subscriber,
26
- log_level=LogLevel.INFO,
27
- chunk_size=2000,
28
- chunk_overlap=100,
29
- ):
30
-
20
+ def __init__(self, **params):
21
+
22
+ input_queue = params.get("input_queue", default_input_queue)
23
+ output_queue = params.get("output_queue", default_output_queue)
24
+ subscriber = params.get("subscriber", default_subscriber)
25
+ chunk_size = params.get("chunk_size", 2000)
26
+ chunk_overlap = params.get("chunk_overlap", 100)
27
+
31
28
  super(Processor, self).__init__(
32
- pulsar_host=pulsar_host,
33
- log_level=log_level,
34
- input_queue=input_queue,
35
- output_queue=output_queue,
36
- subscriber=subscriber,
37
- input_schema=TextDocument,
38
- output_schema=Chunk,
29
+ **params | {
30
+ "input_queue": input_queue,
31
+ "output_queue": output_queue,
32
+ "subscriber": subscriber,
33
+ "input_schema": TextDocument,
34
+ "output_schema": Chunk,
35
+ }
39
36
  )
40
37
 
41
38
  self.text_splitter = RecursiveCharacterTextSplitter(
@@ -18,23 +18,20 @@ default_subscriber = 'pdf-decoder'
18
18
 
19
19
  class Processor(ConsumerProducer):
20
20
 
21
- def __init__(
22
- self,
23
- pulsar_host=None,
24
- input_queue=default_input_queue,
25
- output_queue=default_output_queue,
26
- subscriber=default_subscriber,
27
- log_level=LogLevel.INFO,
28
- ):
21
+ def __init__(self, **params):
22
+
23
+ input_queue = params.get("input_queue", default_input_queue)
24
+ output_queue = params.get("output_queue", default_output_queue)
25
+ subscriber = params.get("subscriber", default_subscriber)
29
26
 
30
27
  super(Processor, self).__init__(
31
- pulsar_host=pulsar_host,
32
- log_level=log_level,
33
- input_queue=input_queue,
34
- output_queue=output_queue,
35
- subscriber=subscriber,
36
- input_schema=Document,
37
- output_schema=TextDocument,
28
+ **params | {
29
+ "input_queue": input_queue,
30
+ "output_queue": output_queue,
31
+ "subscriber": subscriber,
32
+ "input_schema": Document,
33
+ "output_schema": TextDocument,
34
+ }
38
35
  )
39
36
 
40
37
  print("PDF inited")
@@ -17,24 +17,21 @@ default_model="all-MiniLM-L6-v2"
17
17
 
18
18
  class Processor(ConsumerProducer):
19
19
 
20
- def __init__(
21
- self,
22
- pulsar_host=None,
23
- input_queue=default_input_queue,
24
- output_queue=default_output_queue,
25
- subscriber=default_subscriber,
26
- log_level=LogLevel.INFO,
27
- model=default_model,
28
- ):
20
+ def __init__(self, **params):
21
+
22
+ input_queue = params.get("input_queue", default_input_queue)
23
+ output_queue = params.get("output_queue", default_output_queue)
24
+ subscriber = params.get("subscriber", default_subscriber)
25
+ model = params.get("model", default_model)
29
26
 
30
27
  super(Processor, self).__init__(
31
- pulsar_host=pulsar_host,
32
- log_level=log_level,
33
- input_queue=input_queue,
34
- output_queue=output_queue,
35
- subscriber=subscriber,
36
- input_schema=EmbeddingsRequest,
37
- output_schema=EmbeddingsResponse,
28
+ **params | {
29
+ "input_queue": input_queue,
30
+ "output_queue": output_queue,
31
+ "subscriber": subscriber,
32
+ "input_schema": EmbeddingsRequest,
33
+ "output_schema": EmbeddingsResponse,
34
+ }
38
35
  )
39
36
 
40
37
  self.embeddings = HuggingFaceEmbeddings(model_name=model)
@@ -17,25 +17,20 @@ default_ollama = 'http://localhost:11434'
17
17
 
18
18
  class Processor(ConsumerProducer):
19
19
 
20
- def __init__(
21
- self,
22
- pulsar_host=None,
23
- input_queue=default_input_queue,
24
- output_queue=default_output_queue,
25
- subscriber=default_subscriber,
26
- log_level=LogLevel.INFO,
27
- model=default_model,
28
- ollama=default_ollama,
29
- ):
20
+ def __init__(self, **params):
21
+
22
+ input_queue = params.get("input_queue", default_input_queue)
23
+ output_queue = params.get("output_queue", default_output_queue)
24
+ subscriber = params.get("subscriber", default_subscriber)
30
25
 
31
26
  super(Processor, self).__init__(
32
- pulsar_host=pulsar_host,
33
- log_level=log_level,
34
- input_queue=input_queue,
35
- output_queue=output_queue,
36
- subscriber=subscriber,
37
- input_schema=EmbeddingsRequest,
38
- output_schema=EmbeddingsResponse,
27
+ **params | {
28
+ "input_queue": input_queue,
29
+ "output_queue": output_queue,
30
+ "subscriber": subscriber,
31
+ "input_schema": EmbeddingsRequest,
32
+ "output_schema": EmbeddingsResponse,
33
+ }
39
34
  )
40
35
 
41
36
  self.embeddings = OllamaEmbeddings(base_url=ollama, model=model)
@@ -15,26 +15,23 @@ default_subscriber = 'embeddings-vectorizer'
15
15
 
16
16
  class Processor(ConsumerProducer):
17
17
 
18
- def __init__(
19
- self,
20
- pulsar_host=None,
21
- input_queue=default_input_queue,
22
- output_queue=default_output_queue,
23
- subscriber=default_subscriber,
24
- log_level=LogLevel.INFO,
25
- ):
18
+ def __init__(self, **params):
19
+
20
+ input_queue = params.get("input_queue", default_input_queue)
21
+ output_queue = params.get("output_queue", default_output_queue)
22
+ subscriber = params.get("subscriber", default_subscriber)
26
23
 
27
24
  super(Processor, self).__init__(
28
- pulsar_host=pulsar_host,
29
- log_level=log_level,
30
- input_queue=input_queue,
31
- output_queue=output_queue,
32
- subscriber=subscriber,
33
- input_schema=Chunk,
34
- output_schema=VectorsChunk,
25
+ **params | {
26
+ "input_queue": input_queue,
27
+ "output_queue": output_queue,
28
+ "subscriber": subscriber,
29
+ "input_schema": Chunk,
30
+ "output_schema": VectorsChunk,
31
+ }
35
32
  )
36
33
 
37
- self.embeddings = EmbeddingsClient(pulsar_host=pulsar_host)
34
+ self.embeddings = EmbeddingsClient(pulsar_host=self.pulsar_host)
38
35
 
39
36
  def emit(self, source, chunk, vectors):
40
37
 
@@ -20,27 +20,22 @@ default_graph_host='localhost'
20
20
 
21
21
  class Processor(Consumer):
22
22
 
23
- def __init__(
24
- self,
25
- pulsar_host=None,
26
- input_queue=default_input_queue,
27
- subscriber=default_subscriber,
28
- graph_host=default_graph_host,
29
- log_level=LogLevel.INFO,
30
- ):
23
+ def __init__(self, **params):
24
+
25
+ input_queue = params.get("input_queue", default_input_queue)
26
+ subscriber = params.get("subscriber", default_subscriber)
27
+ graph_host = params.get("graph_host", default_graph_host)
31
28
 
32
29
  super(Processor, self).__init__(
33
- pulsar_host=pulsar_host,
34
- log_level=log_level,
35
- input_queue=input_queue,
36
- subscriber=subscriber,
37
- input_schema=Triple,
30
+ **params | {
31
+ "input_queue": input_queue,
32
+ "subscriber": subscriber,
33
+ "input_schema": Triple,
34
+ }
38
35
  )
39
36
 
40
37
  self.tg = TrustGraph([graph_host])
41
38
 
42
- self.count = 0
43
-
44
39
  def handle(self, msg):
45
40
 
46
41
  v = msg.value()
@@ -51,11 +46,6 @@ class Processor(Consumer):
51
46
  v.o.value
52
47
  )
53
48
 
54
- self.count += 1
55
-
56
- if (self.count % 1000) == 0:
57
- print(self.count, "...", flush=True)
58
-
59
49
  @staticmethod
60
50
  def add_args(parser):
61
51
 
@@ -18,7 +18,7 @@ class GraphRag:
18
18
  verbose=False,
19
19
  entity_limit=50,
20
20
  triple_limit=30,
21
- max_sg_size=3000,
21
+ max_subgraph_size=3000,
22
22
  ):
23
23
 
24
24
  self.verbose=verbose
@@ -37,7 +37,7 @@ class GraphRag:
37
37
 
38
38
  self.entity_limit=entity_limit
39
39
  self.query_limit=triple_limit
40
- self.max_sg_size=max_sg_size
40
+ self.max_subgraph_size=max_subgraph_size
41
41
 
42
42
  self.label_cache = {}
43
43
 
@@ -149,7 +149,7 @@ class GraphRag:
149
149
 
150
150
  subgraph = list(subgraph)
151
151
 
152
- subgraph = subgraph[0:self.max_sg_size]
152
+ subgraph = subgraph[0:self.max_subgraph_size]
153
153
 
154
154
  if self.verbose:
155
155
  print("Subgraph:", flush=True)