trustgraph 0.3.1.tar.gz → 0.4.2.tar.gz

This diff shows the changes between two publicly available package versions released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of trustgraph might be problematic. Click here for more details.

Files changed (96)
  1. {trustgraph-0.3.1 → trustgraph-0.4.2}/PKG-INFO +3 -2
  2. {trustgraph-0.3.1 → trustgraph-0.4.2}/setup.py +2 -1
  3. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/base/processor.py +140 -48
  4. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/chunker/recursive/chunker.py +15 -18
  5. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/decoder/pdf/pdf_decoder.py +12 -15
  6. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/hf/hf.py +13 -16
  7. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/ollama/processor.py +12 -17
  8. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/vectorize/vectorize.py +13 -16
  9. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/graph/cassandra_write/write.py +10 -20
  10. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/graph_rag.py +3 -3
  11. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/kg/extract_definitions/extract.py +13 -16
  12. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/kg/extract_relationships/extract.py +25 -17
  13. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/azure_text/llm.py +15 -17
  14. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/claude_text/llm.py +17 -19
  15. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/ollama_text/llm.py +27 -17
  16. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/vertexai_text/llm.py +15 -18
  17. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/rag/graph/rag.py +24 -25
  18. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/vector/milvus_write/write.py +12 -13
  19. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph.egg-info/PKG-INFO +3 -2
  20. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph.egg-info/requires.txt +1 -0
  21. {trustgraph-0.3.1 → trustgraph-0.4.2}/LICENSE +0 -0
  22. {trustgraph-0.3.1 → trustgraph-0.4.2}/README.md +0 -0
  23. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/chunker-recursive +0 -0
  24. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/embeddings-hf +0 -0
  25. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/embeddings-ollama +0 -0
  26. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/embeddings-vectorize +0 -0
  27. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/graph-rag +0 -0
  28. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/graph-show +0 -0
  29. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/graph-to-turtle +0 -0
  30. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/graph-write-cassandra +0 -0
  31. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/init-pulsar-manager +0 -0
  32. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/kg-extract-definitions +0 -0
  33. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/kg-extract-relationships +0 -0
  34. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/llm-azure-text +0 -0
  35. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/llm-claude-text +0 -0
  36. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/llm-ollama-text +0 -0
  37. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/llm-vertexai-text +0 -0
  38. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/loader +0 -0
  39. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/pdf-decoder +0 -0
  40. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/query +0 -0
  41. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/run-processing +0 -0
  42. {trustgraph-0.3.1 → trustgraph-0.4.2}/scripts/vector-write-milvus +0 -0
  43. {trustgraph-0.3.1 → trustgraph-0.4.2}/setup.cfg +0 -0
  44. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/__init__.py +0 -0
  45. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/base/__init__.py +0 -0
  46. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/chunker/__init__.py +0 -0
  47. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/chunker/recursive/__init__.py +0 -0
  48. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/chunker/recursive/__main__.py +0 -0
  49. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/decoder/__init__.py +0 -0
  50. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/decoder/pdf/__init__.py +0 -0
  51. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/decoder/pdf/__main__.py +0 -0
  52. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/__init__.py +0 -0
  53. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/hf/__init__.py +0 -0
  54. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/hf/__main__.py +0 -0
  55. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/ollama/__init__.py +0 -0
  56. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/ollama/__main__.py +0 -0
  57. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/vectorize/__init__.py +0 -0
  58. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings/vectorize/__main__.py +0 -0
  59. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/embeddings_client.py +0 -0
  60. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/graph/__init__.py +0 -0
  61. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/graph/cassandra_write/__init__.py +0 -0
  62. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/graph/cassandra_write/__main__.py +0 -0
  63. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/graph_rag_client.py +0 -0
  64. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/kg/__init__.py +0 -0
  65. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/kg/extract_definitions/__init__.py +0 -0
  66. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/kg/extract_definitions/__main__.py +0 -0
  67. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/kg/extract_relationships/__init__.py +0 -0
  68. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/kg/extract_relationships/__main__.py +0 -0
  69. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/__init__.py +0 -0
  70. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/azure_text/__init__.py +0 -0
  71. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/azure_text/__main__.py +0 -0
  72. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/claude_text/__init__.py +0 -0
  73. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/claude_text/__main__.py +0 -0
  74. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/ollama_text/__init__.py +0 -0
  75. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/ollama_text/__main__.py +0 -0
  76. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/vertexai_text/__init__.py +0 -0
  77. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm/vertexai_text/__main__.py +0 -0
  78. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/llm_client.py +0 -0
  79. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/log_level.py +0 -0
  80. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/processing/__init__.py +0 -0
  81. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/processing/__main__.py +0 -0
  82. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/processing/processing.py +0 -0
  83. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/prompts.py +0 -0
  84. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/rag/__init__.py +0 -0
  85. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/rag/graph/__init__.py +0 -0
  86. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/rag/graph/__main__.py +0 -0
  87. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/rdf.py +0 -0
  88. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/schema.py +0 -0
  89. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/triple_vectors.py +0 -0
  90. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/trustgraph.py +0 -0
  91. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/vector/__init__.py +0 -0
  92. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/vector/milvus_write/__init__.py +0 -0
  93. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph/vector/milvus_write/__main__.py +0 -0
  94. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph.egg-info/SOURCES.txt +0 -0
  95. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph.egg-info/dependency_links.txt +0 -0
  96. {trustgraph-0.3.1 → trustgraph-0.4.2}/trustgraph.egg-info/top_level.txt +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: trustgraph
3
- Version: 0.3.1
3
+ Version: 0.4.2
4
4
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
5
  Home-page: https://github.com/trustgraph-ai/trustgraph
6
- Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.3.1.tar.gz
6
+ Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.4.2.tar.gz
7
7
  Author: trustgraph.ai
8
8
  Author-email: security@trustgraph.ai
9
9
  Classifier: Programming Language :: Python :: 3
@@ -31,6 +31,7 @@ Requires-Dist: pypdf
31
31
  Requires-Dist: anthropic
32
32
  Requires-Dist: google-cloud-aiplatform
33
33
  Requires-Dist: pyyaml
34
+ Requires-Dist: prometheus-client
34
35
 
35
36
 
36
37
  # TrustGraph
@@ -4,7 +4,7 @@ import os
4
4
  with open("README.md", "r") as fh:
5
5
  long_description = fh.read()
6
6
 
7
- version = "0.3.1"
7
+ version = "0.4.2"
8
8
 
9
9
  setuptools.setup(
10
10
  name="trustgraph",
@@ -43,6 +43,7 @@ setuptools.setup(
43
43
  "anthropic",
44
44
  "google-cloud-aiplatform",
45
45
  "pyyaml",
46
+ "prometheus-client",
46
47
  ],
47
48
  scripts=[
48
49
  "scripts/chunker-recursive",
@@ -2,8 +2,10 @@
2
2
  import os
3
3
  import argparse
4
4
  import pulsar
5
+ import _pulsar
5
6
  import time
6
7
  from pulsar.schema import JsonSchema
8
+ from prometheus_client import start_http_server, Histogram, Info, Counter
7
9
 
8
10
  from .. log_level import LogLevel
9
11
 
@@ -11,16 +13,23 @@ class BaseProcessor:
11
13
 
12
14
  default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
13
15
 
14
- def __init__(
15
- self,
16
- pulsar_host=default_pulsar_host,
17
- log_level=LogLevel.INFO,
18
- ):
16
+ def __init__(self, **params):
19
17
 
20
18
  self.client = None
21
19
 
22
- if pulsar_host == None:
23
- pulsar_host = default_pulsar_host
20
+ if not hasattr(__class__, "params_metric"):
21
+ __class__.params_metric = Info(
22
+ 'params', 'Parameters configuration'
23
+ )
24
+
25
+ # FIXME: Maybe outputs information it should not
26
+ __class__.params_metric.info({
27
+ k: str(params[k])
28
+ for k in params
29
+ })
30
+
31
+ pulsar_host = params.get("pulsar_host", self.default_pulsar_host)
32
+ log_level = params.get("log_level", LogLevel.INFO)
24
33
 
25
34
  self.pulsar_host = pulsar_host
26
35
 
@@ -51,6 +60,20 @@ class BaseProcessor:
51
60
  help=f'Output queue (default: info)'
52
61
  )
53
62
 
63
+ parser.add_argument(
64
+ '-M', '--metrics-enabled',
65
+ type=bool,
66
+ default=True,
67
+ help=f'Pulsar host (default: true)',
68
+ )
69
+
70
+ parser.add_argument(
71
+ '-P', '--metrics-port',
72
+ type=int,
73
+ default=8000,
74
+ help=f'Pulsar host (default: 8000)',
75
+ )
76
+
54
77
  def run(self):
55
78
  raise RuntimeError("Something should have implemented the run method")
56
79
 
@@ -69,13 +92,26 @@ class BaseProcessor:
69
92
  args = parser.parse_args()
70
93
  args = vars(args)
71
94
 
95
+ if args["metrics_enabled"]:
96
+ start_http_server(args["metrics_port"])
97
+
72
98
  try:
73
99
 
74
100
  p = cls(**args)
75
101
  p.run()
76
102
 
103
+ except KeyboardInterrupt:
104
+ print("Keyboard interrupt.")
105
+ return
106
+
107
+ except _pulsar.Interrupted:
108
+ print("Pulsar Interrupted.")
109
+ return
110
+
77
111
  except Exception as e:
78
112
 
113
+ print(type(e))
114
+
79
115
  print("Exception:", e, flush=True)
80
116
  print("Will retry...", flush=True)
81
117
 
@@ -83,23 +119,38 @@ class BaseProcessor:
83
119
 
84
120
  class Consumer(BaseProcessor):
85
121
 
86
- def __init__(
87
- self,
88
- pulsar_host=None,
89
- log_level=LogLevel.INFO,
90
- input_queue="input",
91
- subscriber="subscriber",
92
- input_schema=None,
93
- ):
122
+ def __init__(self, **params):
94
123
 
95
- super(Consumer, self).__init__(
96
- pulsar_host=pulsar_host,
97
- log_level=log_level,
98
- )
124
+ super(Consumer, self).__init__(**params)
125
+
126
+ input_queue = params.get("input_queue")
127
+ subscriber = params.get("subscriber")
128
+ input_schema = params.get("input_schema")
99
129
 
100
130
  if input_schema == None:
101
131
  raise RuntimeError("input_schema must be specified")
102
132
 
133
+ if not hasattr(__class__, "request_metric"):
134
+ __class__.request_metric = Histogram(
135
+ 'request_latency', 'Request latency (seconds)'
136
+ )
137
+
138
+ if not hasattr(__class__, "pubsub_metric"):
139
+ __class__.pubsub_metric = Info(
140
+ 'pubsub', 'Pub/sub configuration'
141
+ )
142
+
143
+ if not hasattr(__class__, "processing_metric"):
144
+ __class__.processing_metric = Counter(
145
+ 'processing_count', 'Processing count', ["status"]
146
+ )
147
+
148
+ __class__.pubsub_metric.info({
149
+ "input_queue": input_queue,
150
+ "subscriber": subscriber,
151
+ "input_schema": input_schema.__name__,
152
+ })
153
+
103
154
  self.consumer = self.client.subscribe(
104
155
  input_queue, subscriber,
105
156
  schema=JsonSchema(input_schema),
@@ -113,11 +164,14 @@ class Consumer(BaseProcessor):
113
164
 
114
165
  try:
115
166
 
116
- self.handle(msg)
167
+ with __class__.request_metric.time():
168
+ self.handle(msg)
117
169
 
118
170
  # Acknowledge successful processing of the message
119
171
  self.consumer.acknowledge(msg)
120
172
 
173
+ __class__.processing_metric.labels(status="success").inc()
174
+
121
175
  except Exception as e:
122
176
 
123
177
  print("Exception:", e, flush=True)
@@ -125,6 +179,8 @@ class Consumer(BaseProcessor):
125
179
  # Message failed to be processed
126
180
  self.consumer.negative_acknowledge(msg)
127
181
 
182
+ __class__.processing_metric.labels(status="error").inc()
183
+
128
184
  @staticmethod
129
185
  def add_args(parser, default_input_queue, default_subscriber):
130
186
 
@@ -144,21 +200,43 @@ class Consumer(BaseProcessor):
144
200
 
145
201
  class ConsumerProducer(BaseProcessor):
146
202
 
147
- def __init__(
148
- self,
149
- pulsar_host=None,
150
- log_level=LogLevel.INFO,
151
- input_queue="input",
152
- output_queue="output",
153
- subscriber="subscriber",
154
- input_schema=None,
155
- output_schema=None,
156
- ):
203
+ def __init__(self, **params):
157
204
 
158
- super(ConsumerProducer, self).__init__(
159
- pulsar_host=pulsar_host,
160
- log_level=log_level,
161
- )
205
+ input_queue = params.get("input_queue")
206
+ output_queue = params.get("output_queue")
207
+ subscriber = params.get("subscriber")
208
+ input_schema = params.get("input_schema")
209
+ output_schema = params.get("output_schema")
210
+
211
+ if not hasattr(__class__, "request_metric"):
212
+ __class__.request_metric = Histogram(
213
+ 'request_latency', 'Request latency (seconds)'
214
+ )
215
+
216
+ if not hasattr(__class__, "output_metric"):
217
+ __class__.output_metric = Counter(
218
+ 'output_count', 'Output items created'
219
+ )
220
+
221
+ if not hasattr(__class__, "pubsub_metric"):
222
+ __class__.pubsub_metric = Info(
223
+ 'pubsub', 'Pub/sub configuration'
224
+ )
225
+
226
+ if not hasattr(__class__, "processing_metric"):
227
+ __class__.processing_metric = Counter(
228
+ 'processing_count', 'Processing count', ["status"]
229
+ )
230
+
231
+ __class__.pubsub_metric.info({
232
+ "input_queue": input_queue,
233
+ "output_queue": output_queue,
234
+ "subscriber": subscriber,
235
+ "input_schema": input_schema.__name__,
236
+ "output_schema": output_schema.__name__,
237
+ })
238
+
239
+ super(ConsumerProducer, self).__init__(**params)
162
240
 
163
241
  if input_schema == None:
164
242
  raise RuntimeError("input_schema must be specified")
@@ -184,11 +262,14 @@ class ConsumerProducer(BaseProcessor):
184
262
 
185
263
  try:
186
264
 
187
- resp = self.handle(msg)
265
+ with __class__.request_metric.time():
266
+ resp = self.handle(msg)
188
267
 
189
268
  # Acknowledge successful processing of the message
190
269
  self.consumer.acknowledge(msg)
191
270
 
271
+ __class__.processing_metric.labels(status="success").inc()
272
+
192
273
  except Exception as e:
193
274
 
194
275
  print("Exception:", e, flush=True)
@@ -196,9 +277,11 @@ class ConsumerProducer(BaseProcessor):
196
277
  # Message failed to be processed
197
278
  self.consumer.negative_acknowledge(msg)
198
279
 
199
- def send(self, msg, properties={}):
280
+ __class__.processing_metric.labels(status="error").inc()
200
281
 
282
+ def send(self, msg, properties={}):
201
283
  self.producer.send(msg, properties)
284
+ __class__.output_metric.inc()
202
285
 
203
286
  @staticmethod
204
287
  def add_args(
@@ -228,18 +311,27 @@ class ConsumerProducer(BaseProcessor):
228
311
 
229
312
  class Producer(BaseProcessor):
230
313
 
231
- def __init__(
232
- self,
233
- pulsar_host=None,
234
- log_level=LogLevel.INFO,
235
- output_queue="output",
236
- output_schema=None,
237
- ):
314
+ def __init__(self, **params):
238
315
 
239
- super(Producer, self).__init__(
240
- pulsar_host=pulsar_host,
241
- log_level=log_level,
242
- )
316
+ output_queue = params.get("output_queue")
317
+ output_schema = params.get("output_schema")
318
+
319
+ if not hasattr(__class__, "output_metric"):
320
+ __class__.output_metric = Counter(
321
+ 'output_count', 'Output items created'
322
+ )
323
+
324
+ if not hasattr(__class__, "pubsub_metric"):
325
+ __class__.pubsub_metric = Info(
326
+ 'pubsub', 'Pub/sub configuration'
327
+ )
328
+
329
+ __class__.pubsub_metric.info({
330
+ "output_queue": output_queue,
331
+ "output_schema": output_schema.__name__,
332
+ })
333
+
334
+ super(Producer, self).__init__(**params)
243
335
 
244
336
  if output_schema == None:
245
337
  raise RuntimeError("output_schema must be specified")
@@ -250,8 +342,8 @@ class Producer(BaseProcessor):
250
342
  )
251
343
 
252
344
  def send(self, msg, properties={}):
253
-
254
345
  self.producer.send(msg, properties)
346
+ __class__.output_metric.inc()
255
347
 
256
348
  @staticmethod
257
349
  def add_args(
@@ -17,25 +17,22 @@ default_subscriber = 'chunker-recursive'
17
17
 
18
18
  class Processor(ConsumerProducer):
19
19
 
20
- def __init__(
21
- self,
22
- pulsar_host=None,
23
- input_queue=default_input_queue,
24
- output_queue=default_output_queue,
25
- subscriber=default_subscriber,
26
- log_level=LogLevel.INFO,
27
- chunk_size=2000,
28
- chunk_overlap=100,
29
- ):
30
-
20
+ def __init__(self, **params):
21
+
22
+ input_queue = params.get("input_queue", default_input_queue)
23
+ output_queue = params.get("output_queue", default_output_queue)
24
+ subscriber = params.get("subscriber", default_subscriber)
25
+ chunk_size = params.get("chunk_size", 2000)
26
+ chunk_overlap = params.get("chunk_overlap", 100)
27
+
31
28
  super(Processor, self).__init__(
32
- pulsar_host=pulsar_host,
33
- log_level=log_level,
34
- input_queue=input_queue,
35
- output_queue=output_queue,
36
- subscriber=subscriber,
37
- input_schema=TextDocument,
38
- output_schema=Chunk,
29
+ **params | {
30
+ "input_queue": input_queue,
31
+ "output_queue": output_queue,
32
+ "subscriber": subscriber,
33
+ "input_schema": TextDocument,
34
+ "output_schema": Chunk,
35
+ }
39
36
  )
40
37
 
41
38
  self.text_splitter = RecursiveCharacterTextSplitter(
@@ -18,23 +18,20 @@ default_subscriber = 'pdf-decoder'
18
18
 
19
19
  class Processor(ConsumerProducer):
20
20
 
21
- def __init__(
22
- self,
23
- pulsar_host=None,
24
- input_queue=default_input_queue,
25
- output_queue=default_output_queue,
26
- subscriber=default_subscriber,
27
- log_level=LogLevel.INFO,
28
- ):
21
+ def __init__(self, **params):
22
+
23
+ input_queue = params.get("input_queue", default_input_queue)
24
+ output_queue = params.get("output_queue", default_output_queue)
25
+ subscriber = params.get("subscriber", default_subscriber)
29
26
 
30
27
  super(Processor, self).__init__(
31
- pulsar_host=pulsar_host,
32
- log_level=log_level,
33
- input_queue=input_queue,
34
- output_queue=output_queue,
35
- subscriber=subscriber,
36
- input_schema=Document,
37
- output_schema=TextDocument,
28
+ **params | {
29
+ "input_queue": input_queue,
30
+ "output_queue": output_queue,
31
+ "subscriber": subscriber,
32
+ "input_schema": Document,
33
+ "output_schema": TextDocument,
34
+ }
38
35
  )
39
36
 
40
37
  print("PDF inited")
@@ -17,24 +17,21 @@ default_model="all-MiniLM-L6-v2"
17
17
 
18
18
  class Processor(ConsumerProducer):
19
19
 
20
- def __init__(
21
- self,
22
- pulsar_host=None,
23
- input_queue=default_input_queue,
24
- output_queue=default_output_queue,
25
- subscriber=default_subscriber,
26
- log_level=LogLevel.INFO,
27
- model=default_model,
28
- ):
20
+ def __init__(self, **params):
21
+
22
+ input_queue = params.get("input_queue", default_input_queue)
23
+ output_queue = params.get("output_queue", default_output_queue)
24
+ subscriber = params.get("subscriber", default_subscriber)
25
+ model = params.get("model", default_model)
29
26
 
30
27
  super(Processor, self).__init__(
31
- pulsar_host=pulsar_host,
32
- log_level=log_level,
33
- input_queue=input_queue,
34
- output_queue=output_queue,
35
- subscriber=subscriber,
36
- input_schema=EmbeddingsRequest,
37
- output_schema=EmbeddingsResponse,
28
+ **params | {
29
+ "input_queue": input_queue,
30
+ "output_queue": output_queue,
31
+ "subscriber": subscriber,
32
+ "input_schema": EmbeddingsRequest,
33
+ "output_schema": EmbeddingsResponse,
34
+ }
38
35
  )
39
36
 
40
37
  self.embeddings = HuggingFaceEmbeddings(model_name=model)
@@ -17,25 +17,20 @@ default_ollama = 'http://localhost:11434'
17
17
 
18
18
  class Processor(ConsumerProducer):
19
19
 
20
- def __init__(
21
- self,
22
- pulsar_host=None,
23
- input_queue=default_input_queue,
24
- output_queue=default_output_queue,
25
- subscriber=default_subscriber,
26
- log_level=LogLevel.INFO,
27
- model=default_model,
28
- ollama=default_ollama,
29
- ):
20
+ def __init__(self, **params):
21
+
22
+ input_queue = params.get("input_queue", default_input_queue)
23
+ output_queue = params.get("output_queue", default_output_queue)
24
+ subscriber = params.get("subscriber", default_subscriber)
30
25
 
31
26
  super(Processor, self).__init__(
32
- pulsar_host=pulsar_host,
33
- log_level=log_level,
34
- input_queue=input_queue,
35
- output_queue=output_queue,
36
- subscriber=subscriber,
37
- input_schema=EmbeddingsRequest,
38
- output_schema=EmbeddingsResponse,
27
+ **params | {
28
+ "input_queue": input_queue,
29
+ "output_queue": output_queue,
30
+ "subscriber": subscriber,
31
+ "input_schema": EmbeddingsRequest,
32
+ "output_schema": EmbeddingsResponse,
33
+ }
39
34
  )
40
35
 
41
36
  self.embeddings = OllamaEmbeddings(base_url=ollama, model=model)
@@ -15,26 +15,23 @@ default_subscriber = 'embeddings-vectorizer'
15
15
 
16
16
  class Processor(ConsumerProducer):
17
17
 
18
- def __init__(
19
- self,
20
- pulsar_host=None,
21
- input_queue=default_input_queue,
22
- output_queue=default_output_queue,
23
- subscriber=default_subscriber,
24
- log_level=LogLevel.INFO,
25
- ):
18
+ def __init__(self, **params):
19
+
20
+ input_queue = params.get("input_queue", default_input_queue)
21
+ output_queue = params.get("output_queue", default_output_queue)
22
+ subscriber = params.get("subscriber", default_subscriber)
26
23
 
27
24
  super(Processor, self).__init__(
28
- pulsar_host=pulsar_host,
29
- log_level=log_level,
30
- input_queue=input_queue,
31
- output_queue=output_queue,
32
- subscriber=subscriber,
33
- input_schema=Chunk,
34
- output_schema=VectorsChunk,
25
+ **params | {
26
+ "input_queue": input_queue,
27
+ "output_queue": output_queue,
28
+ "subscriber": subscriber,
29
+ "input_schema": Chunk,
30
+ "output_schema": VectorsChunk,
31
+ }
35
32
  )
36
33
 
37
- self.embeddings = EmbeddingsClient(pulsar_host=pulsar_host)
34
+ self.embeddings = EmbeddingsClient(pulsar_host=self.pulsar_host)
38
35
 
39
36
  def emit(self, source, chunk, vectors):
40
37
 
@@ -20,27 +20,22 @@ default_graph_host='localhost'
20
20
 
21
21
  class Processor(Consumer):
22
22
 
23
- def __init__(
24
- self,
25
- pulsar_host=None,
26
- input_queue=default_input_queue,
27
- subscriber=default_subscriber,
28
- graph_host=default_graph_host,
29
- log_level=LogLevel.INFO,
30
- ):
23
+ def __init__(self, **params):
24
+
25
+ input_queue = params.get("input_queue", default_input_queue)
26
+ subscriber = params.get("subscriber", default_subscriber)
27
+ graph_host = params.get("graph_host", default_graph_host)
31
28
 
32
29
  super(Processor, self).__init__(
33
- pulsar_host=pulsar_host,
34
- log_level=log_level,
35
- input_queue=input_queue,
36
- subscriber=subscriber,
37
- input_schema=Triple,
30
+ **params | {
31
+ "input_queue": input_queue,
32
+ "subscriber": subscriber,
33
+ "input_schema": Triple,
34
+ }
38
35
  )
39
36
 
40
37
  self.tg = TrustGraph([graph_host])
41
38
 
42
- self.count = 0
43
-
44
39
  def handle(self, msg):
45
40
 
46
41
  v = msg.value()
@@ -51,11 +46,6 @@ class Processor(Consumer):
51
46
  v.o.value
52
47
  )
53
48
 
54
- self.count += 1
55
-
56
- if (self.count % 1000) == 0:
57
- print(self.count, "...", flush=True)
58
-
59
49
  @staticmethod
60
50
  def add_args(parser):
61
51
 
@@ -18,7 +18,7 @@ class GraphRag:
18
18
  verbose=False,
19
19
  entity_limit=50,
20
20
  triple_limit=30,
21
- max_sg_size=3000,
21
+ max_subgraph_size=3000,
22
22
  ):
23
23
 
24
24
  self.verbose=verbose
@@ -37,7 +37,7 @@ class GraphRag:
37
37
 
38
38
  self.entity_limit=entity_limit
39
39
  self.query_limit=triple_limit
40
- self.max_sg_size=max_sg_size
40
+ self.max_subgraph_size=max_subgraph_size
41
41
 
42
42
  self.label_cache = {}
43
43
 
@@ -149,7 +149,7 @@ class GraphRag:
149
149
 
150
150
  subgraph = list(subgraph)
151
151
 
152
- subgraph = subgraph[0:self.max_sg_size]
152
+ subgraph = subgraph[0:self.max_subgraph_size]
153
153
 
154
154
  if self.verbose:
155
155
  print("Subgraph:", flush=True)
@@ -22,26 +22,23 @@ default_subscriber = 'kg-extract-definitions'
22
22
 
23
23
  class Processor(ConsumerProducer):
24
24
 
25
- def __init__(
26
- self,
27
- pulsar_host=None,
28
- input_queue=default_input_queue,
29
- output_queue=default_output_queue,
30
- subscriber=default_subscriber,
31
- log_level=LogLevel.INFO,
32
- ):
25
+ def __init__(self, **params):
26
+
27
+ input_queue = params.get("input_queue", default_input_queue)
28
+ output_queue = params.get("output_queue", default_output_queue)
29
+ subscriber = params.get("subscriber", default_subscriber)
33
30
 
34
31
  super(Processor, self).__init__(
35
- pulsar_host=pulsar_host,
36
- log_level=log_level,
37
- input_queue=input_queue,
38
- output_queue=output_queue,
39
- subscriber=subscriber,
40
- input_schema=VectorsChunk,
41
- output_schema=Triple,
32
+ **params | {
33
+ "input_queue": input_queue,
34
+ "output_queue": output_queue,
35
+ "subscriber": subscriber,
36
+ "input_schema": VectorsChunk,
37
+ "output_schema": Triple,
38
+ }
42
39
  )
43
40
 
44
- self.llm = LlmClient(pulsar_host=pulsar_host)
41
+ self.llm = LlmClient(pulsar_host=self.pulsar_host)
45
42
 
46
43
  def to_uri(self, text):
47
44