trustgraph 0.2.2.tar.gz → 0.2.4.tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (97)
  1. trustgraph-0.2.4/PKG-INFO +136 -0
  2. trustgraph-0.2.4/README.md +102 -0
  3. trustgraph-0.2.4/scripts/embeddings-ollama +6 -0
  4. trustgraph-0.2.4/scripts/init-pulsar-manager +11 -0
  5. {trustgraph-0.2.2 → trustgraph-0.2.4}/setup.py +9 -7
  6. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/chunker/recursive/chunker.py +20 -2
  7. trustgraph-0.2.4/trustgraph/embeddings/ollama/__init__.py +3 -0
  8. trustgraph-0.2.4/trustgraph/embeddings/ollama/__main__.py +7 -0
  9. trustgraph-0.2.4/trustgraph/embeddings/ollama/processor.py +175 -0
  10. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph_rag.py +11 -6
  11. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/ollama_text/llm.py +2 -2
  12. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rag/graph/rag.py +31 -1
  13. trustgraph-0.2.2/trustgraph/edge_map.py → trustgraph-0.2.4/trustgraph/triple_vectors.py +52 -18
  14. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/trustgraph.py +1 -1
  15. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/vector/milvus_write/write.py +2 -2
  16. trustgraph-0.2.4/trustgraph.egg-info/PKG-INFO +136 -0
  17. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph.egg-info/SOURCES.txt +6 -1
  18. trustgraph-0.2.2/PKG-INFO +0 -454
  19. trustgraph-0.2.2/README.md +0 -420
  20. trustgraph-0.2.2/trustgraph.egg-info/PKG-INFO +0 -454
  21. {trustgraph-0.2.2 → trustgraph-0.2.4}/LICENSE +0 -0
  22. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/chunker-recursive +0 -0
  23. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/embeddings-hf +0 -0
  24. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/embeddings-vectorize +0 -0
  25. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/graph-rag +0 -0
  26. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/graph-show +0 -0
  27. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/graph-to-turtle +0 -0
  28. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/graph-write-cassandra +0 -0
  29. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/kg-extract-definitions +0 -0
  30. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/kg-extract-relationships +0 -0
  31. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/llm-azure-text +0 -0
  32. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/llm-claude-text +0 -0
  33. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/llm-ollama-text +0 -0
  34. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/llm-vertexai-text +0 -0
  35. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/loader +0 -0
  36. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/pdf-decoder +0 -0
  37. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/query +0 -0
  38. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/run-processing +0 -0
  39. {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/vector-write-milvus +0 -0
  40. {trustgraph-0.2.2 → trustgraph-0.2.4}/setup.cfg +0 -0
  41. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/__init__.py +0 -0
  42. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/chunker/__init__.py +0 -0
  43. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/chunker/recursive/__init__.py +0 -0
  44. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/chunker/recursive/__main__.py +0 -0
  45. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/decoder/__init__.py +0 -0
  46. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/decoder/pdf/__init__.py +0 -0
  47. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/decoder/pdf/__main__.py +0 -0
  48. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/decoder/pdf/pdf_decoder.py +0 -0
  49. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/__init__.py +0 -0
  50. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/hf/__init__.py +0 -0
  51. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/hf/__main__.py +0 -0
  52. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/hf/hf.py +0 -0
  53. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/vectorize/__init__.py +0 -0
  54. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/vectorize/__main__.py +0 -0
  55. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/vectorize/vectorize.py +0 -0
  56. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings_client.py +0 -0
  57. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph/__init__.py +0 -0
  58. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph/cassandra_write/__init__.py +0 -0
  59. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph/cassandra_write/__main__.py +0 -0
  60. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph/cassandra_write/write.py +0 -0
  61. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph_rag_client.py +0 -0
  62. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/__init__.py +0 -0
  63. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_definitions/__init__.py +0 -0
  64. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_definitions/__main__.py +0 -0
  65. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_definitions/extract.py +0 -0
  66. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_relationships/__init__.py +0 -0
  67. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_relationships/__main__.py +0 -0
  68. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_relationships/extract.py +0 -0
  69. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/__init__.py +0 -0
  70. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/azure_text/__init__.py +0 -0
  71. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/azure_text/__main__.py +0 -0
  72. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/azure_text/llm.py +0 -0
  73. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/claude_text/__init__.py +0 -0
  74. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/claude_text/__main__.py +0 -0
  75. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/claude_text/llm.py +0 -0
  76. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/ollama_text/__init__.py +0 -0
  77. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/ollama_text/__main__.py +0 -0
  78. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/vertexai_text/__init__.py +0 -0
  79. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/vertexai_text/__main__.py +0 -0
  80. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/vertexai_text/llm.py +0 -0
  81. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm_client.py +0 -0
  82. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/log_level.py +0 -0
  83. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/processing/__init__.py +0 -0
  84. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/processing/__main__.py +0 -0
  85. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/processing/processing.py +0 -0
  86. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/prompts.py +0 -0
  87. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rag/__init__.py +0 -0
  88. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rag/graph/__init__.py +0 -0
  89. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rag/graph/__main__.py +0 -0
  90. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rdf.py +0 -0
  91. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/schema.py +0 -0
  92. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/vector/__init__.py +0 -0
  93. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/vector/milvus_write/__init__.py +0 -0
  94. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/vector/milvus_write/__main__.py +0 -0
  95. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph.egg-info/dependency_links.txt +0 -0
  96. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph.egg-info/requires.txt +0 -0
  97. {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph.egg-info/top_level.txt +0 -0
trustgraph-0.2.4/PKG-INFO
@@ -0,0 +1,136 @@
+ Metadata-Version: 2.1
+ Name: trustgraph
+ Version: 0.2.4
+ Summary: TrustGraph provides a means to run a flexible pipeline of AI processing components.
+ Home-page: https://github.com/trustgraph-ai/trustgraph
+ Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.2.4.tar.gz
+ Author: trustgraph.ai
+ Author-email: security@trustgraph.ai
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: torch
+ Requires-Dist: urllib3
+ Requires-Dist: transformers
+ Requires-Dist: sentence-transformers
+ Requires-Dist: rdflib
+ Requires-Dist: pymilvus
+ Requires-Dist: langchain
+ Requires-Dist: langchain-core
+ Requires-Dist: langchain-huggingface
+ Requires-Dist: langchain-text-splitters
+ Requires-Dist: langchain-community
+ Requires-Dist: huggingface-hub
+ Requires-Dist: requests
+ Requires-Dist: cassandra-driver
+ Requires-Dist: pulsar-client
+ Requires-Dist: pypdf
+ Requires-Dist: anthropic
+ Requires-Dist: google-cloud-aiplatform
+ Requires-Dist: pyyaml
+
+
+ # TrustGraph
+
+ ## Introduction
+
+ TrustGraph is a true end-to-end (e2e) knowledge pipeline that performs a `naive extraction` on a text corpus
+ to build an RDF-style knowledge graph coupled with a `RAG` service compatible with cloud LLMs and open-source
+ SLMs (Small Language Models).
+
+ The pipeline processing components are interconnected with a pub/sub engine to
+ maximize modularity and enable new knowledge processing functions. The core processing components decode documents,
+ chunk text, perform embeddings, apply a local SLM/LLM, call an LLM API, and generate LM predictions.
+
+ The processing showcases the reliability and efficiency of Graph RAG algorithms, which can capture
+ contextual language cues that are missed in conventional RAG approaches. Graph querying algorithms enable retrieving
+ not just relevant knowledge but language cues essential to understanding semantic uses unique to a text corpus.
+
+ Processing modules are executed in containers. Processing can be scaled up
+ by deploying multiple containers.
+
+ ### Features
+
+ - PDF decoding
+ - Text chunking
+ - Inference of LMs deployed with [Ollama](https://ollama.com)
+ - Inference of LLMs: Claude, VertexAI and AzureAI serverless endpoints
+ - Application of [HuggingFace](https://hf.co) embeddings models
+ - [RDF](https://www.w3.org/TR/rdf12-schema/)-aligned Knowledge Graph extraction
+ - Graph edge loading into [Apache Cassandra](https://github.com/apache/cassandra)
+ - Storing embeddings in [Milvus](https://github.com/milvus-io/milvus)
+ - Embedding query service
+ - Graph RAG query service
+ - All processing integrates with [Apache Pulsar](https://github.com/apache/pulsar/)
+ - Containers, so can be deployed using Docker Compose or Kubernetes
+ - Plug'n'play architecture: switch different LLM modules to suit your needs
+
+ ## Architecture
+
+ ![architecture](architecture.png)
+
+ TrustGraph is designed to be modular to support as many Language Models and environments as possible. A natural
+ fit for a modular architecture is to decompose functions into a set of modules connected through a pub/sub backbone.
+ [Apache Pulsar](https://github.com/apache/pulsar/) serves as this pub/sub backbone. Pulsar acts as the data broker
+ managing inputs and outputs between modules.
+
+ **Pulsar Workflows**:
+ - For processing flows, Pulsar accepts the output of a processing module
+   and queues it for input to the next subscribed module.
+ - For services such as LLMs and embeddings, Pulsar provides a client/server
+   model. A Pulsar queue is used as the input to the service. When
+   processed, the output is then delivered to a separate queue where a client
+   subscriber can request that output.
+
+ The entire architecture, the pub/sub backbone and set of modules, is bundled into a single Python package. A container image with the
+ package installed can also run the entire architecture.
+
+ ## Core Modules
+
+ - `chunker-recursive` - Accepts text documents and uses the LangChain recursive
+   chunking algorithm to produce smaller text chunks.
+ - `embeddings-hf` - A service which analyses text and returns a vector
+   embedding using one of the HuggingFace embeddings models.
+ - `embeddings-vectorize` - Uses an embeddings service to get a vector
+   embedding which is added to the processor payload.
+ - `graph-rag` - A query service which applies a Graph RAG algorithm to
+   provide a response to a text prompt.
+ - `graph-write-cassandra` - Takes knowledge graph edges and writes them to
+   a Cassandra store.
+ - `kg-extract-definitions` - knowledge extractor - examines text and
+   produces graph edges
+   describing discovered terms and their definitions. Definitions are
+   derived from the input documents.
+ - `kg-extract-relationships` - knowledge extractor - examines text and
+   produces graph edges describing the relationships between discovered
+   terms.
+ - `loader` - Takes a document and loads it into the processing pipeline. Used
+   e.g. to add PDF documents.
+ - `pdf-decoder` - Takes a PDF doc and emits text extracted from the document.
+   Text extraction from PDF is not a perfect science as PDF is a printable
+   format. For instance, the wrapping of text between lines in a PDF document
+   is not semantically encoded, so the decoder will see wrapped lines as
+   space-separated.
+ - `vector-write-milvus` - Takes vector-entity mappings and records them
+   in the vector embeddings store.
+
+ ## LM Specific Modules
+
+ - `llm-azure-text` - Sends request to an AzureAI serverless endpoint
+ - `llm-claude-text` - Sends request to Anthropic's API
+ - `llm-ollama-text` - Sends request to an LM running under Ollama
+ - `llm-vertexai-text` - Sends request to a model available through the VertexAI API
+
+ ## Quickstart Guide
+
+ See [Quickstart on Docker Compose](docs/README.quickstart-docker-compose.md)
+
+ ## Development Guide
+
+ See [Development on trustgraph](docs/README.development.md)
+
+
+
trustgraph-0.2.4/README.md
@@ -0,0 +1,102 @@
+
+ # TrustGraph
+
+ ## Introduction
+
+ TrustGraph is a true end-to-end (e2e) knowledge pipeline that performs a `naive extraction` on a text corpus
+ to build an RDF-style knowledge graph coupled with a `RAG` service compatible with cloud LLMs and open-source
+ SLMs (Small Language Models).
+
+ The pipeline processing components are interconnected with a pub/sub engine to
+ maximize modularity and enable new knowledge processing functions. The core processing components decode documents,
+ chunk text, perform embeddings, apply a local SLM/LLM, call an LLM API, and generate LM predictions.
+
+ The processing showcases the reliability and efficiency of Graph RAG algorithms, which can capture
+ contextual language cues that are missed in conventional RAG approaches. Graph querying algorithms enable retrieving
+ not just relevant knowledge but language cues essential to understanding semantic uses unique to a text corpus.
+
+ Processing modules are executed in containers. Processing can be scaled up
+ by deploying multiple containers.
+
+ ### Features
+
+ - PDF decoding
+ - Text chunking
+ - Inference of LMs deployed with [Ollama](https://ollama.com)
+ - Inference of LLMs: Claude, VertexAI and AzureAI serverless endpoints
+ - Application of [HuggingFace](https://hf.co) embeddings models
+ - [RDF](https://www.w3.org/TR/rdf12-schema/)-aligned Knowledge Graph extraction
+ - Graph edge loading into [Apache Cassandra](https://github.com/apache/cassandra)
+ - Storing embeddings in [Milvus](https://github.com/milvus-io/milvus)
+ - Embedding query service
+ - Graph RAG query service
+ - All processing integrates with [Apache Pulsar](https://github.com/apache/pulsar/)
+ - Containers, so can be deployed using Docker Compose or Kubernetes
+ - Plug'n'play architecture: switch different LLM modules to suit your needs
+
+ ## Architecture
+
+ ![architecture](architecture.png)
+
+ TrustGraph is designed to be modular to support as many Language Models and environments as possible. A natural
+ fit for a modular architecture is to decompose functions into a set of modules connected through a pub/sub backbone.
+ [Apache Pulsar](https://github.com/apache/pulsar/) serves as this pub/sub backbone. Pulsar acts as the data broker
+ managing inputs and outputs between modules.
+
+ **Pulsar Workflows**:
+ - For processing flows, Pulsar accepts the output of a processing module
+   and queues it for input to the next subscribed module.
+ - For services such as LLMs and embeddings, Pulsar provides a client/server
+   model. A Pulsar queue is used as the input to the service. When
+   processed, the output is then delivered to a separate queue where a client
+   subscriber can request that output.
+
+ The entire architecture, the pub/sub backbone and set of modules, is bundled into a single Python package. A container image with the
+ package installed can also run the entire architecture.
+
+ ## Core Modules
+
+ - `chunker-recursive` - Accepts text documents and uses the LangChain recursive
+   chunking algorithm to produce smaller text chunks.
+ - `embeddings-hf` - A service which analyses text and returns a vector
+   embedding using one of the HuggingFace embeddings models.
+ - `embeddings-vectorize` - Uses an embeddings service to get a vector
+   embedding which is added to the processor payload.
+ - `graph-rag` - A query service which applies a Graph RAG algorithm to
+   provide a response to a text prompt.
+ - `graph-write-cassandra` - Takes knowledge graph edges and writes them to
+   a Cassandra store.
+ - `kg-extract-definitions` - knowledge extractor - examines text and
+   produces graph edges
+   describing discovered terms and their definitions. Definitions are
+   derived from the input documents.
+ - `kg-extract-relationships` - knowledge extractor - examines text and
+   produces graph edges describing the relationships between discovered
+   terms.
+ - `loader` - Takes a document and loads it into the processing pipeline. Used
+   e.g. to add PDF documents.
+ - `pdf-decoder` - Takes a PDF doc and emits text extracted from the document.
+   Text extraction from PDF is not a perfect science as PDF is a printable
+   format. For instance, the wrapping of text between lines in a PDF document
+   is not semantically encoded, so the decoder will see wrapped lines as
+   space-separated.
+ - `vector-write-milvus` - Takes vector-entity mappings and records them
+   in the vector embeddings store.
+
+ ## LM Specific Modules
+
+ - `llm-azure-text` - Sends request to an AzureAI serverless endpoint
+ - `llm-claude-text` - Sends request to Anthropic's API
+ - `llm-ollama-text` - Sends request to an LM running under Ollama
+ - `llm-vertexai-text` - Sends request to a model available through the VertexAI API
+
+ ## Quickstart Guide
+
+ See [Quickstart on Docker Compose](docs/README.quickstart-docker-compose.md)
+
+ ## Development Guide
+
+ See [Development on trustgraph](docs/README.development.md)
+
+
+
trustgraph-0.2.4/scripts/embeddings-ollama
@@ -0,0 +1,6 @@
+ #!/usr/bin/env python3
+
+ from trustgraph.embeddings.ollama import run
+
+ run()
+
trustgraph-0.2.4/scripts/init-pulsar-manager
@@ -0,0 +1,11 @@
+ #!/usr/bin/env bash
+
+ CSRF_TOKEN=$(curl http://localhost:7750/pulsar-manager/csrf-token)
+
+ curl \
+     -H "X-XSRF-TOKEN: $CSRF_TOKEN" \
+     -H "Cookie: XSRF-TOKEN=$CSRF_TOKEN;" \
+     -H 'Content-Type: application/json' \
+     -X PUT \
+     http://localhost:7750/pulsar-manager/users/superuser \
+     -d '{"name": "admin", "password": "apachepulsar", "description": "test", "email": "username@test.org"}'
{trustgraph-0.2.2 → trustgraph-0.2.4}/setup.py
@@ -4,7 +4,7 @@ import os
  with open("README.md", "r") as fh:
      long_description = fh.read()

- version = "0.2.2"
+ version = "0.2.4"

  setuptools.setup(
      name="trustgraph",
@@ -46,22 +46,24 @@ setuptools.setup(
      ],
      scripts=[
          "scripts/chunker-recursive",
+         "scripts/embeddings-hf",
+         "scripts/embeddings-ollama",
+         "scripts/embeddings-vectorize",
+         "scripts/graph-rag",
          "scripts/graph-show",
          "scripts/graph-to-turtle",
          "scripts/graph-write-cassandra",
+         "scripts/init-pulsar-manager",
          "scripts/kg-extract-definitions",
          "scripts/kg-extract-relationships",
+         "scripts/llm-azure-text",
+         "scripts/llm-claude-text",
          "scripts/llm-ollama-text",
          "scripts/llm-vertexai-text",
-         "scripts/llm-claude-text",
-         "scripts/llm-azure-text",
-         "scripts/run-processing",
          "scripts/loader",
          "scripts/pdf-decoder",
          "scripts/query",
-         "scripts/embeddings-vectorize",
-         "scripts/embeddings-hf",
+         "scripts/run-processing",
          "scripts/vector-write-milvus",
-         "scripts/graph-rag",
      ]
  )
{trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/chunker/recursive/chunker.py
@@ -30,6 +30,8 @@ class Processor:
          output_queue=default_output_queue,
          subscriber=default_subscriber,
          log_level=LogLevel.INFO,
+         chunk_size=2000,
+         chunk_overlap=100,
      ):

          self.client = None
@@ -50,8 +52,8 @@
          )

          self.text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=1000,
-             chunk_overlap=20,
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
              length_function=len,
              is_separator_regex=False,
          )
@@ -146,6 +148,20 @@ def run():
          help=f'Log level (default: info)'
      )

+     parser.add_argument(
+         '-z', '--chunk-size',
+         type=int,
+         default=2000,
+         help=f'Chunk size (default: 2000)'
+     )
+
+     parser.add_argument(
+         '-v', '--chunk-overlap',
+         type=int,
+         default=100,
+         help=f'Chunk overlap (default: 100)'
+     )
+
      args = parser.parse_args()


@@ -159,6 +175,8 @@ def run():
          output_queue=args.output_queue,
          subscriber=args.subscriber,
          log_level=args.log_level,
+         chunk_size=args.chunk_size,
+         chunk_overlap=args.chunk_overlap,
      )

      p.run()
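The chunker change above raises the splitter defaults from 1000/20 to 2000/100 and makes them settable via `--chunk-size`/`--chunk-overlap`. A quick way to see the effect of those parameters is to run the splitter standalone; a minimal sketch, assuming the `langchain-text-splitters` package already listed in the new requirements (the input file name is a placeholder):

```python
# Standalone sketch of the splitter configuration the chunker now defaults to
# (chunk_size=2000, chunk_overlap=100, previously 1000/20).
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,        # max characters per chunk
    chunk_overlap=100,      # characters shared between adjacent chunks
    length_function=len,
    is_separator_regex=False,
)

# "document.txt" is a hypothetical input file.
with open("document.txt") as f:
    chunks = splitter.split_text(f.read())

print(len(chunks), "chunks; largest:", max(len(c) for c in chunks), "chars")
```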
trustgraph-0.2.4/trustgraph/embeddings/ollama/__init__.py
@@ -0,0 +1,3 @@
+
+ from . processor import *
+
trustgraph-0.2.4/trustgraph/embeddings/ollama/__main__.py
@@ -0,0 +1,7 @@
+ #!/usr/bin/env python3
+
+ from . processor import run
+
+ if __name__ == '__main__':
+     run()
+
trustgraph-0.2.4/trustgraph/embeddings/ollama/processor.py
@@ -0,0 +1,175 @@
+
+ """
+ Embeddings service, applies an embeddings model served by Ollama.
+ Input is text, output is embeddings vector.
+ """
+
+ import pulsar
+ from pulsar.schema import JsonSchema
+ import tempfile
+ import base64
+ import os
+ import argparse
+ from langchain_community.embeddings import OllamaEmbeddings
+ import time
+
+ from ... schema import EmbeddingsRequest, EmbeddingsResponse
+ from ... log_level import LogLevel
+
+ default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
+ default_input_queue = 'embeddings'
+ default_output_queue = 'embeddings-response'
+ default_subscriber = 'embeddings-ollama'
+ default_model="mxbai-embed-large"
+ default_ollama = 'http://localhost:11434'
+
+ class Processor:
+
+     def __init__(
+         self,
+         pulsar_host=default_pulsar_host,
+         input_queue=default_input_queue,
+         output_queue=default_output_queue,
+         subscriber=default_subscriber,
+         log_level=LogLevel.INFO,
+         model=default_model,
+         ollama=default_ollama,
+     ):
+
+         self.client = None
+
+         self.client = pulsar.Client(
+             pulsar_host,
+             logger=pulsar.ConsoleLogger(log_level.to_pulsar())
+         )
+
+         self.consumer = self.client.subscribe(
+             input_queue, subscriber,
+             schema=JsonSchema(EmbeddingsRequest),
+         )
+
+         self.producer = self.client.create_producer(
+             topic=output_queue,
+             schema=JsonSchema(EmbeddingsResponse),
+         )
+
+         self.embeddings = OllamaEmbeddings(base_url=ollama, model=model)
+
+     def run(self):
+
+         while True:
+
+             msg = self.consumer.receive()
+
+             try:
+
+                 v = msg.value()
+
+                 # Sender-produced ID
+
+                 id = msg.properties()["id"]
+
+                 print(f"Handling input {id}...", flush=True)
+
+                 text = v.text
+                 embeds = self.embeddings.embed_query(text)
+
+                 print("Send response...", flush=True)
+                 r = EmbeddingsResponse(vectors=[embeds])
+
+                 self.producer.send(r, properties={"id": id})
+
+                 print("Done.", flush=True)
+
+                 # Acknowledge successful processing of the message
+                 self.consumer.acknowledge(msg)
+
+             except Exception as e:
+
+                 print("Exception:", e, flush=True)
+
+                 # Message failed to be processed
+                 self.consumer.negative_acknowledge(msg)
+
+     def __del__(self):
+
+         if self.client:
+             self.client.close()
+
+ def run():
+
+     parser = argparse.ArgumentParser(
+         prog='embeddings-ollama',
+         description=__doc__,
+     )
+
+     parser.add_argument(
+         '-p', '--pulsar-host',
+         default=default_pulsar_host,
+         help=f'Pulsar host (default: {default_pulsar_host})',
+     )
+
+     parser.add_argument(
+         '-i', '--input-queue',
+         default=default_input_queue,
+         help=f'Input queue (default: {default_input_queue})'
+     )
+
+     parser.add_argument(
+         '-s', '--subscriber',
+         default=default_subscriber,
+         help=f'Queue subscriber name (default: {default_subscriber})'
+     )
+
+     parser.add_argument(
+         '-o', '--output-queue',
+         default=default_output_queue,
+         help=f'Output queue (default: {default_output_queue})'
+     )
+
+     parser.add_argument(
+         '-l', '--log-level',
+         type=LogLevel,
+         default=LogLevel.INFO,
+         choices=list(LogLevel),
+         help=f'Log level (default: info)'
+     )
+
+     parser.add_argument(
+         '-m', '--model',
+         default=default_model,
+         help=f'Embeddings model (default: {default_model})'
+     )
+
+     parser.add_argument(
+         '-r', '--ollama',
+         default=default_ollama,
+         help=f'ollama (default: {default_ollama})'
+     )
+
+     args = parser.parse_args()
+
+
+     while True:
+
+         try:
+
+             p = Processor(
+                 pulsar_host=args.pulsar_host,
+                 input_queue=args.input_queue,
+                 output_queue=args.output_queue,
+                 subscriber=args.subscriber,
+                 log_level=args.log_level,
+                 model=args.model,
+                 ollama=args.ollama,
+             )
+
+             p.run()
+
+         except Exception as e:
+
+             print("Exception:", e, flush=True)
+             print("Will retry...", flush=True)
+
+             time.sleep(10)
+
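At the heart of the new processor is a single `OllamaEmbeddings` call. A minimal standalone sketch of just that call, assuming a local Ollama server with the default `mxbai-embed-large` model already pulled:

```python
# Sketch of the embedding call the embeddings-ollama processor wraps.
# Assumes `ollama pull mxbai-embed-large` has been run against a local server.
from langchain_community.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="mxbai-embed-large",
)

# embed_query() takes a single string and returns one vector (a list of
# floats); the processor wraps that vector as EmbeddingsResponse(vectors=[v]).
vector = embeddings.embed_query("What is a knowledge graph?")
print(len(vector), "dimensions")
```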
{trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph_rag.py
@@ -1,6 +1,6 @@

  from trustgraph.trustgraph import TrustGraph
- from trustgraph.edge_map import VectorStore
+ from trustgraph.triple_vectors import TripleVectors
  from trustgraph.trustgraph import TrustGraph
  from trustgraph.llm_client import LlmClient
  from trustgraph.embeddings_client import EmbeddingsClient
@@ -15,7 +15,10 @@ class GraphRag:
          graph_hosts=None,
          pulsar_host="pulsar://pulsar:6650",
          vector_store="http://milvus:19530",
-         verbose=False
+         verbose=False,
+         entity_limit=50,
+         triple_limit=30,
+         max_sg_size=3000,
      ):

          self.verbose=verbose
@@ -30,11 +33,11 @@

          self.embeddings = EmbeddingsClient(pulsar_host=pulsar_host)

-         self.vecstore = VectorStore(vector_store)
+         self.vecstore = TripleVectors(vector_store)

-         self.entity_limit=50
-         self.query_limit=30
-         self.max_sg_size=3000
+         self.entity_limit=entity_limit
+         self.query_limit=triple_limit
+         self.max_sg_size=max_sg_size

          self.label_cache = {}

@@ -71,6 +74,8 @@
              limit=self.entity_limit
          )

+         print("Obtained", len(res), "entities")
+
          entities = set([
              item["entity"]["entity"]
              for item in res
{trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/ollama_text/llm.py
@@ -142,8 +142,8 @@ def run():

      parser.add_argument(
          '-r', '--ollama',
-         default="http://localhost:11434",
-         help=f'ollama (default: http://localhost:11434)'
+         default=default_ollama,
+         help=f'ollama (default: {default_ollama})'
      )

      args = parser.parse_args()
{trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rag/graph/rag.py
@@ -34,6 +34,9 @@ class Processor:
          log_level=LogLevel.INFO,
          graph_hosts=default_graph_hosts,
          vector_store=default_vector_store,
+         entity_limit=50,
+         triple_limit=30,
+         max_sg_size=3000,
      ):

          self.client = None
@@ -58,6 +61,9 @@
              graph_hosts=graph_hosts,
              vector_store=vector_store,
              verbose=True,
+             entity_limit=entity_limit,
+             triple_limit=triple_limit,
+             max_sg_size=max_sg_size,
          )

      def run(self):
@@ -102,7 +108,7 @@ class Processor:
  def run():

      parser = argparse.ArgumentParser(
-         prog='llm-ollama-text',
+         prog='graph-rag',
          description=__doc__,
      )

@@ -150,6 +156,27 @@ def run():
          help=f'Vector host (default: http://milvus:19530)'
      )

+     parser.add_argument(
+         '-e', '--entity-limit',
+         type=int,
+         default=50,
+         help=f'Entity vector fetch limit (default: 50)'
+     )
+
+     parser.add_argument(
+         '-t', '--triple-limit',
+         type=int,
+         default=30,
+         help=f'Triple query limit, per query (default: 30)'
+     )
+
+     parser.add_argument(
+         '-u', '--max-subgraph-size',
+         type=int,
+         default=3000,
+         help=f'Max subgraph size (default: 3000)'
+     )
+
      args = parser.parse_args()

      while True:
@@ -164,6 +191,9 @@ def run():
          log_level=args.log_level,
          graph_hosts=args.graph_hosts.split(","),
          vector_store=args.vector_store,
+         entity_limit=args.entity_limit,
+         triple_limit=args.triple_limit,
+         max_sg_size=args.max_subgraph_size,
      )

      p.run()
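Taken together, the graph-rag changes expose three retrieval tunables (`--entity-limit`, `--triple-limit`, `--max-subgraph-size`) that flow from the CLI through the `Processor` into `GraphRag`. A hedged sketch of setting them programmatically: the constructor keywords match the diff above, the endpoint values are the stated defaults, and since the query entry point is not shown in this diff, only construction is sketched.

```python
# Sketch: constructing GraphRag with the retrieval tunables added in 0.2.4.
# Keyword names come from the diff above; endpoints are the stated defaults.
from trustgraph.graph_rag import GraphRag

rag = GraphRag(
    pulsar_host="pulsar://pulsar:6650",
    vector_store="http://milvus:19530",
    verbose=True,
    entity_limit=50,     # vectors fetched per entity lookup
    triple_limit=30,     # triples returned per graph query
    max_sg_size=3000,    # cap on the assembled subgraph
)
```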