trustgraph 0.2.3__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of trustgraph might be problematic.
Files changed (110)
  1. {trustgraph-0.2.3 → trustgraph-0.3.0}/PKG-INFO +4 -4
  2. {trustgraph-0.2.3 → trustgraph-0.3.0}/README.md +2 -2
  3. {trustgraph-0.2.3 → trustgraph-0.3.0}/setup.py +1 -1
  4. trustgraph-0.3.0/trustgraph/base/processor.py +266 -0
  5. trustgraph-0.3.0/trustgraph/chunker/recursive/chunker.py +99 -0
  6. trustgraph-0.3.0/trustgraph/decoder/pdf/pdf_decoder.py +87 -0
  7. trustgraph-0.3.0/trustgraph/embeddings/hf/hf.py +77 -0
  8. trustgraph-0.3.0/trustgraph/embeddings/ollama/__init__.py +3 -0
  9. trustgraph-0.3.0/trustgraph/embeddings/ollama/processor.py +86 -0
  10. trustgraph-0.3.0/trustgraph/embeddings/vectorize/vectorize.py +77 -0
  11. trustgraph-0.3.0/trustgraph/graph/cassandra_write/write.py +75 -0
  12. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/graph_rag.py +2 -0
  13. trustgraph-0.3.0/trustgraph/kg/extract_definitions/extract.py +108 -0
  14. trustgraph-0.3.0/trustgraph/kg/extract_relationships/extract.py +167 -0
  15. trustgraph-0.3.0/trustgraph/llm/azure_text/llm.py +126 -0
  16. trustgraph-0.3.0/trustgraph/llm/claude_text/llm.py +108 -0
  17. trustgraph-0.3.0/trustgraph/llm/ollama_text/llm.py +88 -0
  18. trustgraph-0.3.0/trustgraph/llm/vertexai_text/llm.py +176 -0
  19. trustgraph-0.3.0/trustgraph/rag/graph/rag.py +117 -0
  20. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/triple_vectors.py +17 -9
  21. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/trustgraph.py +1 -1
  22. trustgraph-0.3.0/trustgraph/vector/milvus_write/write.py +60 -0
  23. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph.egg-info/PKG-INFO +4 -4
  24. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph.egg-info/SOURCES.txt +2 -0
  25. trustgraph-0.2.3/trustgraph/chunker/recursive/chunker.py +0 -191
  26. trustgraph-0.2.3/trustgraph/decoder/pdf/pdf_decoder.py +0 -174
  27. trustgraph-0.2.3/trustgraph/embeddings/hf/hf.py +0 -165
  28. trustgraph-0.2.3/trustgraph/embeddings/ollama/processor.py +0 -175
  29. trustgraph-0.2.3/trustgraph/embeddings/vectorize/vectorize.py +0 -163
  30. trustgraph-0.2.3/trustgraph/graph/cassandra_write/write.py +0 -148
  31. trustgraph-0.2.3/trustgraph/kg/extract_definitions/extract.py +0 -197
  32. trustgraph-0.2.3/trustgraph/kg/extract_relationships/extract.py +0 -253
  33. trustgraph-0.2.3/trustgraph/llm/azure_text/llm.py +0 -213
  34. trustgraph-0.2.3/trustgraph/llm/claude_text/llm.py +0 -192
  35. trustgraph-0.2.3/trustgraph/llm/ollama_text/llm.py +0 -174
  36. trustgraph-0.2.3/trustgraph/llm/vertexai_text/llm.py +0 -258
  37. trustgraph-0.2.3/trustgraph/rag/graph/rag.py +0 -207
  38. trustgraph-0.2.3/trustgraph/vector/milvus_write/write.py +0 -140
  39. {trustgraph-0.2.3 → trustgraph-0.3.0}/LICENSE +0 -0
  40. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/chunker-recursive +0 -0
  41. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/embeddings-hf +0 -0
  42. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/embeddings-ollama +0 -0
  43. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/embeddings-vectorize +0 -0
  44. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/graph-rag +0 -0
  45. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/graph-show +0 -0
  46. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/graph-to-turtle +0 -0
  47. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/graph-write-cassandra +0 -0
  48. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/init-pulsar-manager +0 -0
  49. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/kg-extract-definitions +0 -0
  50. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/kg-extract-relationships +0 -0
  51. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/llm-azure-text +0 -0
  52. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/llm-claude-text +0 -0
  53. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/llm-ollama-text +0 -0
  54. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/llm-vertexai-text +0 -0
  55. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/loader +0 -0
  56. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/pdf-decoder +0 -0
  57. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/query +0 -0
  58. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/run-processing +0 -0
  59. {trustgraph-0.2.3 → trustgraph-0.3.0}/scripts/vector-write-milvus +0 -0
  60. {trustgraph-0.2.3 → trustgraph-0.3.0}/setup.cfg +0 -0
  61. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/__init__.py +0 -0
  62. {trustgraph-0.2.3/trustgraph/embeddings/ollama → trustgraph-0.3.0/trustgraph/base}/__init__.py +0 -0
  63. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/chunker/__init__.py +0 -0
  64. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/chunker/recursive/__init__.py +0 -0
  65. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/chunker/recursive/__main__.py +0 -0
  66. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/decoder/__init__.py +0 -0
  67. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/decoder/pdf/__init__.py +0 -0
  68. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/decoder/pdf/__main__.py +0 -0
  69. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/embeddings/__init__.py +0 -0
  70. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/embeddings/hf/__init__.py +0 -0
  71. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/embeddings/hf/__main__.py +0 -0
  72. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/embeddings/ollama/__main__.py +0 -0
  73. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/embeddings/vectorize/__init__.py +0 -0
  74. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/embeddings/vectorize/__main__.py +0 -0
  75. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/embeddings_client.py +0 -0
  76. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/graph/__init__.py +0 -0
  77. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/graph/cassandra_write/__init__.py +0 -0
  78. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/graph/cassandra_write/__main__.py +0 -0
  79. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/graph_rag_client.py +0 -0
  80. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/kg/__init__.py +0 -0
  81. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/kg/extract_definitions/__init__.py +0 -0
  82. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/kg/extract_definitions/__main__.py +0 -0
  83. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/kg/extract_relationships/__init__.py +0 -0
  84. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/kg/extract_relationships/__main__.py +0 -0
  85. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm/__init__.py +0 -0
  86. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm/azure_text/__init__.py +0 -0
  87. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm/azure_text/__main__.py +0 -0
  88. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm/claude_text/__init__.py +0 -0
  89. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm/claude_text/__main__.py +0 -0
  90. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm/ollama_text/__init__.py +0 -0
  91. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm/ollama_text/__main__.py +0 -0
  92. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm/vertexai_text/__init__.py +0 -0
  93. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm/vertexai_text/__main__.py +0 -0
  94. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/llm_client.py +0 -0
  95. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/log_level.py +0 -0
  96. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/processing/__init__.py +0 -0
  97. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/processing/__main__.py +0 -0
  98. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/processing/processing.py +0 -0
  99. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/prompts.py +0 -0
  100. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/rag/__init__.py +0 -0
  101. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/rag/graph/__init__.py +0 -0
  102. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/rag/graph/__main__.py +0 -0
  103. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/rdf.py +0 -0
  104. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/schema.py +0 -0
  105. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/vector/__init__.py +0 -0
  106. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/vector/milvus_write/__init__.py +0 -0
  107. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph/vector/milvus_write/__main__.py +0 -0
  108. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph.egg-info/dependency_links.txt +0 -0
  109. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph.egg-info/requires.txt +0 -0
  110. {trustgraph-0.2.3 → trustgraph-0.3.0}/trustgraph.egg-info/top_level.txt +0 -0
{trustgraph-0.2.3 → trustgraph-0.3.0}/PKG-INFO
@@ -1,9 +1,9 @@
  Metadata-Version: 2.1
  Name: trustgraph
- Version: 0.2.3
+ Version: 0.3.0
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
  Home-page: https://github.com/trustgraph-ai/trustgraph
- Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.2.3.tar.gz
+ Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.3.0.tar.gz
  Author: trustgraph.ai
  Author-email: security@trustgraph.ai
  Classifier: Programming Language :: Python :: 3
@@ -124,11 +124,11 @@ package installed can also run the entire architecture.
  - `llm-ollama-text` - Sends request to LM running using Ollama
  - `llm-vertexai-text` - Sends request to model available through VertexAI API

- ## Getting start
+ ## Quickstart Guide

  See [Quickstart on Docker Compose](docs/README.quickstart-docker-compose.md)

- ## Development
+ ## Development Guide

  See [Development on trustgraph](docs/README.development.md)

{trustgraph-0.2.3 → trustgraph-0.3.0}/README.md
@@ -90,11 +90,11 @@ package installed can also run the entire architecture.
  - `llm-ollama-text` - Sends request to LM running using Ollama
  - `llm-vertexai-text` - Sends request to model available through VertexAI API

- ## Getting start
+ ## Quickstart Guide

  See [Quickstart on Docker Compose](docs/README.quickstart-docker-compose.md)

- ## Development
+ ## Development Guide

  See [Development on trustgraph](docs/README.development.md)

{trustgraph-0.2.3 → trustgraph-0.3.0}/setup.py
@@ -4,7 +4,7 @@ import os
  with open("README.md", "r") as fh:
      long_description = fh.read()

- version = "0.2.3"
+ version = "0.3.0"

  setuptools.setup(
      name="trustgraph",
trustgraph-0.3.0/trustgraph/base/processor.py
@@ -0,0 +1,266 @@
+
+ import os
+ import argparse
+ import pulsar
+ import time
+ from pulsar.schema import JsonSchema
+
+ from .. log_level import LogLevel
+
+ class BaseProcessor:
+
+     default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
+
+     def __init__(
+         self,
+         pulsar_host=default_pulsar_host,
+         log_level=LogLevel.INFO,
+     ):
+
+         self.client = None
+
+         if pulsar_host == None:
+             pulsar_host = default_pulsar_host
+
+         self.pulsar_host = pulsar_host
+
+         self.client = pulsar.Client(
+             pulsar_host,
+             logger=pulsar.ConsoleLogger(log_level.to_pulsar())
+         )
+
+     def __del__(self):
+
+         if self.client:
+             self.client.close()
+
+     @staticmethod
+     def add_args(parser):
+
+         parser.add_argument(
+             '-p', '--pulsar-host',
+             default=__class__.default_pulsar_host,
+             help=f'Pulsar host (default: {__class__.default_pulsar_host})',
+         )
+
+         parser.add_argument(
+             '-l', '--log-level',
+             type=LogLevel,
+             default=LogLevel.INFO,
+             choices=list(LogLevel),
+             help=f'Output queue (default: info)'
+         )
+
+     def run(self):
+         raise RuntimeError("Something should have implemented the run method")
+
+     @classmethod
+     def start(cls, prog, doc):
+
+         parser = argparse.ArgumentParser(
+             prog=prog,
+             description=doc
+         )
+
+         cls.add_args(parser)
+
+         args = parser.parse_args()
+         args = vars(args)
+
+         try:
+
+             p = cls(**args)
+             p.run()
+
+         except Exception as e:
+
+             print("Exception:", e, flush=True)
+             print("Will retry...", flush=True)
+
+             time.sleep(10)
+
+ class Consumer(BaseProcessor):
+
+     def __init__(
+         self,
+         pulsar_host=None,
+         log_level=LogLevel.INFO,
+         input_queue="input",
+         subscriber="subscriber",
+         input_schema=None,
+     ):
+
+         super(Consumer, self).__init__(
+             pulsar_host=pulsar_host,
+             log_level=log_level,
+         )
+
+         if input_schema == None:
+             raise RuntimeError("input_schema must be specified")
+
+         self.consumer = self.client.subscribe(
+             input_queue, subscriber,
+             schema=JsonSchema(input_schema),
+         )
+
+     def run(self):
+
+         while True:
+
+             msg = self.consumer.receive()
+
+             try:
+
+                 self.handle(msg)
+
+                 # Acknowledge successful processing of the message
+                 self.consumer.acknowledge(msg)
+
+             except Exception as e:
+
+                 print("Exception:", e, flush=True)
+
+                 # Message failed to be processed
+                 self.consumer.negative_acknowledge(msg)
+
+     @staticmethod
+     def add_args(parser, default_input_queue, default_subscriber):
+
+         BaseProcessor.add_args(parser)
+
+         parser.add_argument(
+             '-i', '--input-queue',
+             default=default_input_queue,
+             help=f'Input queue (default: {default_input_queue})'
+         )
+
+         parser.add_argument(
+             '-s', '--subscriber',
+             default=default_subscriber,
+             help=f'Queue subscriber name (default: {default_subscriber})'
+         )
+
+ class ConsumerProducer(BaseProcessor):
+
+     def __init__(
+         self,
+         pulsar_host=None,
+         log_level=LogLevel.INFO,
+         input_queue="input",
+         output_queue="output",
+         subscriber="subscriber",
+         input_schema=None,
+         output_schema=None,
+     ):
+
+         super(ConsumerProducer, self).__init__(
+             pulsar_host=pulsar_host,
+             log_level=log_level,
+         )
+
+         if input_schema == None:
+             raise RuntimeError("input_schema must be specified")
+
+         if output_schema == None:
+             raise RuntimeError("output_schema must be specified")
+
+         self.consumer = self.client.subscribe(
+             input_queue, subscriber,
+             schema=JsonSchema(input_schema),
+         )
+
+         self.producer = self.client.create_producer(
+             topic=output_queue,
+             schema=JsonSchema(output_schema),
+         )
+
+     def run(self):
+
+         while True:
+
+             msg = self.consumer.receive()
+
+             try:
+
+                 resp = self.handle(msg)
+
+                 # Acknowledge successful processing of the message
+                 self.consumer.acknowledge(msg)
+
+             except Exception as e:
+
+                 print("Exception:", e, flush=True)
+
+                 # Message failed to be processed
+                 self.consumer.negative_acknowledge(msg)
+
+     def send(self, msg, properties={}):
+
+         self.producer.send(msg, properties)
+
+     @staticmethod
+     def add_args(
+         parser, default_input_queue, default_subscriber,
+         default_output_queue,
+     ):
+
+         BaseProcessor.add_args(parser)
+
+         parser.add_argument(
+             '-i', '--input-queue',
+             default=default_input_queue,
+             help=f'Input queue (default: {default_input_queue})'
+         )
+
+         parser.add_argument(
+             '-s', '--subscriber',
+             default=default_subscriber,
+             help=f'Queue subscriber name (default: {default_subscriber})'
+         )
+
+         parser.add_argument(
+             '-o', '--output-queue',
+             default=default_output_queue,
+             help=f'Output queue (default: {default_output_queue})'
+         )
+
+ class Producer(BaseProcessor):
+
+     def __init__(
+         self,
+         pulsar_host=None,
+         log_level=LogLevel.INFO,
+         output_queue="output",
+         output_schema=None,
+     ):
+
+         super(Producer, self).__init__(
+             pulsar_host=pulsar_host,
+             log_level=log_level,
+         )
+
+         if output_schema == None:
+             raise RuntimeError("output_schema must be specified")
+
+         self.producer = self.client.create_producer(
+             topic=output_queue,
+             schema=JsonSchema(output_schema),
+         )
+
+     def send(self, msg, properties={}):
+
+         self.producer.send(msg, properties)
+
+     @staticmethod
+     def add_args(
+         parser, default_input_queue, default_subscriber,
+         default_output_queue,
+     ):
+
+         BaseProcessor.add_args(parser)
+
+         parser.add_argument(
+             '-o', '--output-queue',
+             default=default_output_queue,
+             help=f'Output queue (default: {default_output_queue})'
+         )
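The new trustgraph/base module above factors the Pulsar plumbing out of every service: BaseProcessor owns the client and the shared CLI arguments, Consumer adds a subscription plus a receive/ack/nack loop, Producer adds a schema'd producer, and ConsumerProducer combines the two. For orientation, a downstream service built on it looks roughly like this — a minimal sketch with hypothetical schema, queue, and subscriber names, not code from this release; the chunker, decoder, and embeddings services added below follow the same shape:

    from pulsar.schema import Record, String

    from trustgraph.base import ConsumerProducer
    from trustgraph.log_level import LogLevel

    # Hypothetical request/response schemas for illustration; the real
    # services use types from trustgraph.schema such as Document,
    # TextDocument and Chunk.
    class UpperRequest(Record):
        text = String()

    class UpperResponse(Record):
        text = String()

    default_input_queue = 'upper-request'
    default_output_queue = 'upper-response'
    default_subscriber = 'upper-case'

    class Processor(ConsumerProducer):

        def __init__(
            self,
            pulsar_host=None,
            input_queue=default_input_queue,
            output_queue=default_output_queue,
            subscriber=default_subscriber,
            log_level=LogLevel.INFO,
        ):
            super(Processor, self).__init__(
                pulsar_host=pulsar_host,
                log_level=log_level,
                input_queue=input_queue,
                output_queue=output_queue,
                subscriber=subscriber,
                input_schema=UpperRequest,
                output_schema=UpperResponse,
            )

        # run() calls handle() for each message, then acknowledges or
        # negative-acknowledges it depending on whether this raises.
        def handle(self, msg):
            v = msg.value()
            self.send(UpperResponse(text=v.text.upper()))

        @staticmethod
        def add_args(parser):
            ConsumerProducer.add_args(
                parser, default_input_queue, default_subscriber,
                default_output_queue,
            )

    if __name__ == '__main__':
        # start() wires up argparse, builds the processor from the parsed
        # arguments, and sleeps 10 seconds before exiting on failure.
        Processor.start('upper-case', __doc__)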
trustgraph-0.3.0/trustgraph/chunker/recursive/chunker.py
@@ -0,0 +1,99 @@
+
+ """
+ Simple decoder, accepts text documents on input, outputs chunks from the
+ as text as separate output objects.
+ """
+
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+
+ from ... schema import TextDocument, Chunk, Source
+ from ... log_level import LogLevel
+ from ... base import ConsumerProducer
+
+ default_input_queue = 'text-doc-load'
+ default_output_queue = 'chunk-load'
+ default_subscriber = 'chunker-recursive'
+
+ class Processor(ConsumerProducer):
+
+     def __init__(
+         self,
+         pulsar_host=None,
+         input_queue=default_input_queue,
+         output_queue=default_output_queue,
+         subscriber=default_subscriber,
+         log_level=LogLevel.INFO,
+         chunk_size=2000,
+         chunk_overlap=100,
+     ):
+
+         super(Processor, self).__init__(
+             pulsar_host=pulsar_host,
+             log_level=log_level,
+             input_queue=input_queue,
+             output_queue=output_queue,
+             subscriber=subscriber,
+             input_schema=TextDocument,
+             output_schema=Chunk,
+         )
+
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             length_function=len,
+             is_separator_regex=False,
+         )
+
+     def handle(self, msg):
+
+         v = msg.value()
+         print(f"Chunking {v.source.id}...", flush=True)
+
+         texts = self.text_splitter.create_documents(
+             [v.text.decode("utf-8")]
+         )
+
+         for ix, chunk in enumerate(texts):
+
+             id = v.source.id + "-c" + str(ix)
+
+             r = Chunk(
+                 source=Source(
+                     source=v.source.source,
+                     id=id,
+                     title=v.source.title
+                 ),
+                 chunk=chunk.page_content.encode("utf-8"),
+             )
+
+             self.send(r)
+
+         print("Done.", flush=True)
+
+     @staticmethod
+     def add_args(parser):
+
+         ConsumerProducer.add_args(
+             parser, default_input_queue, default_subscriber,
+             default_output_queue,
+         )
+
+         parser.add_argument(
+             '-z', '--chunk-size',
+             type=int,
+             default=2000,
+             help=f'Chunk size (default: 2000)'
+         )
+
+         parser.add_argument(
+             '-v', '--chunk-overlap',
+             type=int,
+             default=100,
+             help=f'Chunk overlap (default: 100)'
+         )
+
+ def run():
+
+     Processor.start('chunker', __doc__)
+
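The chunk_size and chunk_overlap arguments pass straight through to langchain's RecursiveCharacterTextSplitter. To see what the splitter does independently of the Pulsar plumbing, a standalone sketch (sample text and sizes are illustrative only):

    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # Same construction as the processor above, with small sizes so the
    # overlap is visible on a short sample.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=80,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )

    sample = "TrustGraph runs a pipeline of AI processing components. " * 5
    for ix, doc in enumerate(splitter.create_documents([sample])):
        # Mirrors the "-c<ix>" suffix the processor appends to the source id
        print(f"doc-c{ix}: {len(doc.page_content)} chars")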
trustgraph-0.3.0/trustgraph/decoder/pdf/pdf_decoder.py
@@ -0,0 +1,87 @@
+
+ """
+ Simple decoder, accepts PDF documents on input, outputs pages from the
+ PDF document as text as separate output objects.
+ """
+
+ import tempfile
+ import base64
+ from langchain_community.document_loaders import PyPDFLoader
+
+ from ... schema import Document, TextDocument, Source
+ from ... log_level import LogLevel
+ from ... base import ConsumerProducer
+
+ default_input_queue = 'document-load'
+ default_output_queue = 'text-doc-load'
+ default_subscriber = 'pdf-decoder'
+
+ class Processor(ConsumerProducer):
+
+     def __init__(
+         self,
+         pulsar_host=None,
+         input_queue=default_input_queue,
+         output_queue=default_output_queue,
+         subscriber=default_subscriber,
+         log_level=LogLevel.INFO,
+     ):
+
+         super(Processor, self).__init__(
+             pulsar_host=pulsar_host,
+             log_level=log_level,
+             input_queue=input_queue,
+             output_queue=output_queue,
+             subscriber=subscriber,
+             input_schema=Document,
+             output_schema=TextDocument,
+         )
+
+         print("PDF inited")
+
+     def handle(self, msg):
+
+         print("PDF message received")
+
+         v = msg.value()
+
+         print(f"Decoding {v.source.id}...", flush=True)
+
+         with tempfile.NamedTemporaryFile(delete_on_close=False) as fp:
+
+             fp.write(base64.b64decode(v.data))
+             fp.close()
+
+             with open(fp.name, mode='rb') as f:
+
+                 loader = PyPDFLoader(fp.name)
+                 pages = loader.load()
+
+                 for ix, page in enumerate(pages):
+
+                     id = v.source.id + "-p" + str(ix)
+                     r = TextDocument(
+                         source=Source(
+                             source=v.source.source,
+                             title=v.source.title,
+                             id=id,
+                         ),
+                         text=page.page_content.encode("utf-8"),
+                     )
+
+                     self.send(r)
+
+         print("Done.", flush=True)
+
+     @staticmethod
+     def add_args(parser):
+
+         ConsumerProducer.add_args(
+             parser, default_input_queue, default_subscriber,
+             default_output_queue,
+         )
+
+ def run():
+
+     Processor.start("pdf-decoder", __doc__)
+
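The decode step is a base64 round trip through a named temporary file so that PyPDFLoader can read from a path. The same flow can be exercised outside the pipeline roughly as follows — a sketch that assumes a local sample.pdf, and note that the delete_on_close argument used here and above requires Python 3.12 or later:

    import base64
    import tempfile

    from langchain_community.document_loaders import PyPDFLoader

    # Simulate the message payload: documents arrive base64-encoded.
    with open("sample.pdf", "rb") as f:
        data = base64.b64encode(f.read())

    # Same pattern as handle() above: decode to a temporary file, then hand
    # the file name to PyPDFLoader; load() returns one Document per page.
    with tempfile.NamedTemporaryFile(delete_on_close=False) as fp:
        fp.write(base64.b64decode(data))
        fp.close()

        pages = PyPDFLoader(fp.name).load()
        for ix, page in enumerate(pages):
            print(f"page-p{ix}: {len(page.page_content)} chars")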
trustgraph-0.3.0/trustgraph/embeddings/hf/hf.py
@@ -0,0 +1,77 @@
+
+ """
+ Embeddings service, applies an embeddings model selected from HuggingFace.
+ Input is text, output is embeddings vector.
+ """
+
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ from ... schema import EmbeddingsRequest, EmbeddingsResponse
+ from ... log_level import LogLevel
+ from ... base import ConsumerProducer
+
+ default_input_queue = 'embeddings'
+ default_output_queue = 'embeddings-response'
+ default_subscriber = 'embeddings-hf'
+ default_model="all-MiniLM-L6-v2"
+
+ class Processor(ConsumerProducer):
+
+     def __init__(
+         self,
+         pulsar_host=None,
+         input_queue=default_input_queue,
+         output_queue=default_output_queue,
+         subscriber=default_subscriber,
+         log_level=LogLevel.INFO,
+         model=default_model,
+     ):
+
+         super(Processor, self).__init__(
+             pulsar_host=pulsar_host,
+             log_level=log_level,
+             input_queue=input_queue,
+             output_queue=output_queue,
+             subscriber=subscriber,
+             input_schema=EmbeddingsRequest,
+             output_schema=EmbeddingsResponse,
+         )
+
+         self.embeddings = HuggingFaceEmbeddings(model_name=model)
+
+     def handle(self, msg):
+
+         v = msg.value()
+
+         # Sender-produced ID
+         id = msg.properties()["id"]
+
+         print(f"Handling input {id}...", flush=True)
+
+         text = v.text
+         embeds = self.embeddings.embed_documents([text])
+
+         print("Send response...", flush=True)
+         r = EmbeddingsResponse(vectors=embeds)
+         self.producer.send(r, properties={"id": id})
+
+         print("Done.", flush=True)
+
+     @staticmethod
+     def add_args(parser):
+
+         ConsumerProducer.add_args(
+             parser, default_input_queue, default_subscriber,
+             default_output_queue,
+         )
+
+         parser.add_argument(
+             '-m', '--model',
+             default="all-MiniLM-L6-v2",
+             help=f'LLM model (default: all-MiniLM-L6-v2)'
+         )
+
+ def run():
+
+     Processor.start("embeddings-hf", __doc__)
+
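Request/response correlation relies entirely on the sender-supplied "id" message property; the payload itself carries only text and vectors. The embedding call underneath is ordinary langchain usage, shown here standalone with the service's default model (the printed dimension is a property of the model; all-MiniLM-L6-v2 produces 384-dimensional vectors):

    from langchain_huggingface import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # embed_documents takes a list of texts and returns one vector per
    # text, which is why EmbeddingsResponse carries a list of vectors.
    vectors = embeddings.embed_documents(["What is a knowledge graph?"])
    print(len(vectors), len(vectors[0]))  # expect: 1 384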
trustgraph-0.3.0/trustgraph/embeddings/ollama/__init__.py
@@ -0,0 +1,3 @@
+
+ from . processor import *
+
trustgraph-0.3.0/trustgraph/embeddings/ollama/processor.py
@@ -0,0 +1,86 @@
+
+ """
+ Embeddings service, applies an embeddings model selected from HuggingFace.
+ Input is text, output is embeddings vector.
+ """
+ from langchain_community.embeddings import OllamaEmbeddings
+
+ from ... schema import EmbeddingsRequest, EmbeddingsResponse
+ from ... log_level import LogLevel
+ from ... base import ConsumerProducer
+
+ default_input_queue = 'embeddings'
+ default_output_queue = 'embeddings-response'
+ default_subscriber = 'embeddings-ollama'
+ default_model="mxbai-embed-large"
+ default_ollama = 'http://localhost:11434'
+
+ class Processor(ConsumerProducer):
+
+     def __init__(
+         self,
+         pulsar_host=None,
+         input_queue=default_input_queue,
+         output_queue=default_output_queue,
+         subscriber=default_subscriber,
+         log_level=LogLevel.INFO,
+         model=default_model,
+         ollama=default_ollama,
+     ):
+
+         super(Processor, self).__init__(
+             pulsar_host=pulsar_host,
+             log_level=log_level,
+             input_queue=input_queue,
+             output_queue=output_queue,
+             subscriber=subscriber,
+             input_schema=EmbeddingsRequest,
+             output_schema=EmbeddingsResponse,
+         )
+
+         self.embeddings = OllamaEmbeddings(base_url=ollama, model=model)
+
+     def handle(self, msg):
+
+         v = msg.value()
+
+         # Sender-produced ID
+
+         id = msg.properties()["id"]
+
+         print(f"Handling input {id}...", flush=True)
+
+         text = v.text
+         embeds = self.embeddings.embed_query([text])
+
+         print("Send response...", flush=True)
+         r = EmbeddingsResponse(vectors=[embeds])
+
+         self.producer.send(r, properties={"id": id})
+
+         print("Done.", flush=True)
+
+     @staticmethod
+     def add_args(parser):
+
+         ConsumerProducer.add_args(
+             parser, default_input_queue, default_subscriber,
+             default_output_queue,
+         )
+
+         parser.add_argument(
+             '-m', '--model',
+             default=default_model,
+             help=f'Embeddings model (default: {default_model})'
+         )
+
+         parser.add_argument(
+             '-r', '--ollama',
+             default=default_ollama,
+             help=f'ollama (default: {default_ollama})'
+         )
+
+ def run():
+
+     Processor.start('embeddings-ollama', __doc__)
+
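The Ollama variant is the same service shape pointed at a local Ollama server; note it calls embed_query and wraps the single result in a list, where the HuggingFace version gets a list back from embed_documents. Standalone, with the service defaults, the call looks roughly like this — a sketch that assumes a running Ollama with the mxbai-embed-large model pulled, and uses the documented single-string form of embed_query:

    from langchain_community.embeddings import OllamaEmbeddings

    embeddings = OllamaEmbeddings(
        base_url="http://localhost:11434",  # the service's default_ollama
        model="mxbai-embed-large",          # the service's default_model
    )

    # One query string in, one vector out; the processor wraps this in a
    # list to match the EmbeddingsResponse schema.
    vector = embeddings.embed_query("What is a knowledge graph?")
    print(len(vector))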