trustgraph 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of trustgraph might be problematic. Click here for more details.
- trustgraph-0.2.4/PKG-INFO +136 -0
- trustgraph-0.2.4/README.md +102 -0
- trustgraph-0.2.4/scripts/embeddings-ollama +6 -0
- trustgraph-0.2.4/scripts/init-pulsar-manager +11 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/setup.py +9 -7
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/chunker/recursive/chunker.py +20 -2
- trustgraph-0.2.4/trustgraph/embeddings/ollama/__init__.py +3 -0
- trustgraph-0.2.4/trustgraph/embeddings/ollama/__main__.py +7 -0
- trustgraph-0.2.4/trustgraph/embeddings/ollama/processor.py +175 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph_rag.py +11 -6
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/ollama_text/llm.py +2 -2
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rag/graph/rag.py +31 -1
- trustgraph-0.2.2/trustgraph/edge_map.py → trustgraph-0.2.4/trustgraph/triple_vectors.py +52 -18
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/trustgraph.py +1 -1
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/vector/milvus_write/write.py +2 -2
- trustgraph-0.2.4/trustgraph.egg-info/PKG-INFO +136 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph.egg-info/SOURCES.txt +6 -1
- trustgraph-0.2.2/PKG-INFO +0 -454
- trustgraph-0.2.2/README.md +0 -420
- trustgraph-0.2.2/trustgraph.egg-info/PKG-INFO +0 -454
- {trustgraph-0.2.2 → trustgraph-0.2.4}/LICENSE +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/chunker-recursive +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/embeddings-hf +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/embeddings-vectorize +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/graph-rag +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/graph-show +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/graph-to-turtle +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/graph-write-cassandra +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/kg-extract-definitions +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/kg-extract-relationships +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/llm-azure-text +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/llm-claude-text +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/llm-ollama-text +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/llm-vertexai-text +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/loader +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/pdf-decoder +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/query +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/run-processing +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/scripts/vector-write-milvus +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/setup.cfg +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/chunker/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/chunker/recursive/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/chunker/recursive/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/decoder/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/decoder/pdf/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/decoder/pdf/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/decoder/pdf/pdf_decoder.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/hf/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/hf/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/hf/hf.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/vectorize/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/vectorize/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings/vectorize/vectorize.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/embeddings_client.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph/cassandra_write/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph/cassandra_write/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph/cassandra_write/write.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/graph_rag_client.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_definitions/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_definitions/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_definitions/extract.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_relationships/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_relationships/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/kg/extract_relationships/extract.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/azure_text/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/azure_text/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/azure_text/llm.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/claude_text/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/claude_text/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/claude_text/llm.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/ollama_text/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/ollama_text/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/vertexai_text/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/vertexai_text/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm/vertexai_text/llm.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/llm_client.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/log_level.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/processing/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/processing/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/processing/processing.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/prompts.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rag/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rag/graph/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rag/graph/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/rdf.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/schema.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/vector/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/vector/milvus_write/__init__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph/vector/milvus_write/__main__.py +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph.egg-info/dependency_links.txt +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph.egg-info/requires.txt +0 -0
- {trustgraph-0.2.2 → trustgraph-0.2.4}/trustgraph.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: trustgraph
|
|
3
|
+
Version: 0.2.4
|
|
4
|
+
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
|
5
|
+
Home-page: https://github.com/trustgraph-ai/trustgraph
|
|
6
|
+
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.2.4.tar.gz
|
|
7
|
+
Author: trustgraph.ai
|
|
8
|
+
Author-email: security@trustgraph.ai
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: torch
|
|
16
|
+
Requires-Dist: urllib3
|
|
17
|
+
Requires-Dist: transformers
|
|
18
|
+
Requires-Dist: sentence-transformers
|
|
19
|
+
Requires-Dist: rdflib
|
|
20
|
+
Requires-Dist: pymilvus
|
|
21
|
+
Requires-Dist: langchain
|
|
22
|
+
Requires-Dist: langchain-core
|
|
23
|
+
Requires-Dist: langchain-huggingface
|
|
24
|
+
Requires-Dist: langchain-text-splitters
|
|
25
|
+
Requires-Dist: langchain-community
|
|
26
|
+
Requires-Dist: huggingface-hub
|
|
27
|
+
Requires-Dist: requests
|
|
28
|
+
Requires-Dist: cassandra-driver
|
|
29
|
+
Requires-Dist: pulsar-client
|
|
30
|
+
Requires-Dist: pypdf
|
|
31
|
+
Requires-Dist: anthropic
|
|
32
|
+
Requires-Dist: google-cloud-aiplatform
|
|
33
|
+
Requires-Dist: pyyaml
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# TrustGraph
|
|
37
|
+
|
|
38
|
+
## Introduction
|
|
39
|
+
|
|
40
|
+
TrustGraph is a true end-to-end (e2e) knowledge pipeline that performs a `naive extraction` on a text corpus
|
|
41
|
+
to build an RDF style knowledge graph coupled with a `RAG` service compatible with cloud LLMs and open-source
|
|
42
|
+
SLMs (Small Language Models).
|
|
43
|
+
|
|
44
|
+
The pipeline processing components are interconnected with a pub/sub engine to
|
|
45
|
+
maximize modularity and enable new knowledge processing functions. The core processing components decode documents,
|
|
46
|
+
chunk text, perform embeddings, apply a local SLM/LLM, call an LLM API, and generate LM predictions.
|
|
47
|
+
|
|
48
|
+
The processing showcases the reliability and efficiencies of Graph RAG algorithms which can capture
|
|
49
|
+
contextual language flags that are missed in conventional RAG approaches. Graph querying algorithms enable retrieving
|
|
50
|
+
not just relevant knowledge but language cues essential to understanding semantic uses unique to a text corpus.
|
|
51
|
+
|
|
52
|
+
Processing modules are executed in containers. Processing can be scaled-up
|
|
53
|
+
by deploying multiple containers.
|
|
54
|
+
|
|
55
|
+
### Features
|
|
56
|
+
|
|
57
|
+
- PDF decoding
|
|
58
|
+
- Text chunking
|
|
59
|
+
- Inference of LMs deployed with [Ollama](https://ollama.com)
|
|
60
|
+
- Inference of LLMs: Claude, VertexAI and AzureAI serverless endpoints
|
|
61
|
+
- Application of [HuggingFace](https://hf.co) embeddings models
|
|
62
|
+
- [RDF](https://www.w3.org/TR/rdf12-schema/)-aligned Knowledge Graph extraction
|
|
63
|
+
- Graph edge loading into [Apache Cassandra](https://github.com/apache/cassandra)
|
|
64
|
+
- Storing embeddings in [Milvus](https://github.com/milvus-io/milvus)
|
|
65
|
+
- Embedding query service
|
|
66
|
+
- Graph RAG query service
|
|
67
|
+
- All processing integrates with [Apache Pulsar](https://github.com/apache/pulsar/)
|
|
68
|
+
- Containers, so can be deployed using Docker Compose or Kubernetes
|
|
69
|
+
- Plug'n'play architecture: switch different LLM modules to suit your needs
|
|
70
|
+
|
|
71
|
+
## Architecture
|
|
72
|
+
|
|
73
|
+

|
|
74
|
+
|
|
75
|
+
TrustGraph is designed to be modular to support as many Language Models and environments as possible. A natural
|
|
76
|
+
fit for a modular architecture is to decompose functions into a set of modules connected through a pub/sub backbone.
|
|
77
|
+
[Apache Pulsar](https://github.com/apache/pulsar/) serves as this pub/sub backbone. Pulsar acts as the data broker
|
|
78
|
+
managing inputs and outputs between modules.
|
|
79
|
+
|
|
80
|
+
**Pulsar Workflows**:
|
|
81
|
+
- For processing flows, Pulsar accepts the output of a processing module
|
|
82
|
+
and queues it for input to the next subscribed module.
|
|
83
|
+
- For services such as LLMs and embeddings, Pulsar provides a client/server
|
|
84
|
+
model. A Pulsar queue is used as the input to the service. When
|
|
85
|
+
processed, the output is then delivered to a separate queue where a client
|
|
86
|
+
subscriber can request that output.
|
|
87
|
+
|
|
88
|
+
The entire architecture, the pub/sub backbone and set of modules, is bundled into a single Python package. A container image with the
|
|
89
|
+
package installed can also run the entire architecture.
|
|
90
|
+
|
|
91
|
+
## Core Modules
|
|
92
|
+
|
|
93
|
+
- `chunker-recursive` - Accepts text documents and uses LangChain recursive
|
|
94
|
+
chunking algorithm to produce smaller text chunks.
|
|
95
|
+
- `embeddings-hf` - A service which analyses text and returns a vector
|
|
96
|
+
embedding using one of the HuggingFace embeddings models.
|
|
97
|
+
- `embeddings-vectorize` - Uses an embeddings service to get a vector
|
|
98
|
+
embedding which is added to the processor payload.
|
|
99
|
+
- `graph-rag` - A query service which applies a Graph RAG algorithm to
|
|
100
|
+
provide a response to a text prompt.
|
|
101
|
+
- `graph-write-cassandra` - Takes knowledge graph edges and writes them to
|
|
102
|
+
a Cassandra store.
|
|
103
|
+
- `kg-extract-definitions` - knowledge extractor - examines text and
|
|
104
|
+
produces graph edges
|
|
105
|
+
describing discovered terms and also their definitions. Definitions are
|
|
106
|
+
derived using the input documents.
|
|
107
|
+
- `kg-extract-relationships` - knowledge extractor - examines text and
|
|
108
|
+
produces graph edges describing the relationships between discovered
|
|
109
|
+
terms.
|
|
110
|
+
- `loader` - Takes a document and loads it into the processing pipeline. Used
|
|
111
|
+
e.g. to add PDF documents.
|
|
112
|
+
- `pdf-decoder` - Takes a PDF doc and emits text extracted from the document.
|
|
113
|
+
Text extraction from PDF is not a perfect science as PDF is a printable
|
|
114
|
+
format. For instance, the wrapping of text between lines in a PDF document
|
|
115
|
+
is not semantically encoded, so the decoder will see wrapped lines as
|
|
116
|
+
space-separated.
|
|
117
|
+
- `vector-write-milvus` - Takes vector-entity mappings and records them
|
|
118
|
+
in the vector embeddings store.
|
|
119
|
+
|
|
120
|
+
## LM Specific Modules
|
|
121
|
+
|
|
122
|
+
- `llm-azure-text` - Sends request to AzureAI serverless endpoint
|
|
123
|
+
- `llm-claude-text` - Sends request to Anthropic's API
|
|
124
|
+
- `llm-ollama-text` - Sends request to LM running using Ollama
|
|
125
|
+
- `llm-vertexai-text` - Sends request to model available through VertexAI API
|
|
126
|
+
|
|
127
|
+
## Quickstart Guide
|
|
128
|
+
|
|
129
|
+
See [Quickstart on Docker Compose](docs/README.quickstart-docker-compose.md)
|
|
130
|
+
|
|
131
|
+
## Development Guide
|
|
132
|
+
|
|
133
|
+
See [Development on trustgraph](docs/README.development.md)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
|
|
2
|
+
# TrustGraph
|
|
3
|
+
|
|
4
|
+
## Introduction
|
|
5
|
+
|
|
6
|
+
TrustGraph is a true end-to-end (e2e) knowledge pipeline that performs a `naive extraction` on a text corpus
|
|
7
|
+
to build an RDF style knowledge graph coupled with a `RAG` service compatible with cloud LLMs and open-source
|
|
8
|
+
SLMs (Small Language Models).
|
|
9
|
+
|
|
10
|
+
The pipeline processing components are interconnected with a pub/sub engine to
|
|
11
|
+
maximize modularity and enable new knowledge processing functions. The core processing components decode documents,
|
|
12
|
+
chunk text, perform embeddings, apply a local SLM/LLM, call an LLM API, and generate LM predictions.
|
|
13
|
+
|
|
14
|
+
The processing showcases the reliability and efficiencies of Graph RAG algorithms which can capture
|
|
15
|
+
contextual language flags that are missed in conventional RAG approaches. Graph querying algorithms enable retrieving
|
|
16
|
+
not just relevant knowledge but language cues essential to understanding semantic uses unique to a text corpus.
|
|
17
|
+
|
|
18
|
+
Processing modules are executed in containers. Processing can be scaled-up
|
|
19
|
+
by deploying multiple containers.
|
|
20
|
+
|
|
21
|
+
### Features
|
|
22
|
+
|
|
23
|
+
- PDF decoding
|
|
24
|
+
- Text chunking
|
|
25
|
+
- Inference of LMs deployed with [Ollama](https://ollama.com)
|
|
26
|
+
- Inference of LLMs: Claude, VertexAI and AzureAI serverless endpoints
|
|
27
|
+
- Application of [HuggingFace](https://hf.co) embeddings models
|
|
28
|
+
- [RDF](https://www.w3.org/TR/rdf12-schema/)-aligned Knowledge Graph extraction
|
|
29
|
+
- Graph edge loading into [Apache Cassandra](https://github.com/apache/cassandra)
|
|
30
|
+
- Storing embeddings in [Milvus](https://github.com/milvus-io/milvus)
|
|
31
|
+
- Embedding query service
|
|
32
|
+
- Graph RAG query service
|
|
33
|
+
- All processing integrates with [Apache Pulsar](https://github.com/apache/pulsar/)
|
|
34
|
+
- Containers, so can be deployed using Docker Compose or Kubernetes
|
|
35
|
+
- Plug'n'play architecture: switch different LLM modules to suit your needs
|
|
36
|
+
|
|
37
|
+
## Architecture
|
|
38
|
+
|
|
39
|
+

|
|
40
|
+
|
|
41
|
+
TrustGraph is designed to be modular to support as many Language Models and environments as possible. A natural
|
|
42
|
+
fit for a modular architecture is to decompose functions into a set of modules connected through a pub/sub backbone.
|
|
43
|
+
[Apache Pulsar](https://github.com/apache/pulsar/) serves as this pub/sub backbone. Pulsar acts as the data broker
|
|
44
|
+
managing inputs and outputs between modules.
|
|
45
|
+
|
|
46
|
+
**Pulsar Workflows**:
|
|
47
|
+
- For processing flows, Pulsar accepts the output of a processing module
|
|
48
|
+
and queues it for input to the next subscribed module.
|
|
49
|
+
- For services such as LLMs and embeddings, Pulsar provides a client/server
|
|
50
|
+
model. A Pulsar queue is used as the input to the service. When
|
|
51
|
+
processed, the output is then delivered to a separate queue where a client
|
|
52
|
+
subscriber can request that output.
|
|
53
|
+
|
|
54
|
+
The entire architecture, the pub/sub backbone and set of modules, is bundled into a single Python package. A container image with the
|
|
55
|
+
package installed can also run the entire architecture.
|
|
56
|
+
|
|
57
|
+
## Core Modules
|
|
58
|
+
|
|
59
|
+
- `chunker-recursive` - Accepts text documents and uses LangChain recursive
|
|
60
|
+
chunking algorithm to produce smaller text chunks.
|
|
61
|
+
- `embeddings-hf` - A service which analyses text and returns a vector
|
|
62
|
+
embedding using one of the HuggingFace embeddings models.
|
|
63
|
+
- `embeddings-vectorize` - Uses an embeddings service to get a vector
|
|
64
|
+
embedding which is added to the processor payload.
|
|
65
|
+
- `graph-rag` - A query service which applies a Graph RAG algorithm to
|
|
66
|
+
provide a response to a text prompt.
|
|
67
|
+
- `graph-write-cassandra` - Takes knowledge graph edges and writes them to
|
|
68
|
+
a Cassandra store.
|
|
69
|
+
- `kg-extract-definitions` - knowledge extractor - examines text and
|
|
70
|
+
produces graph edges
|
|
71
|
+
describing discovered terms and also their definitions. Definitions are
|
|
72
|
+
derived using the input documents.
|
|
73
|
+
- `kg-extract-relationships` - knowledge extractor - examines text and
|
|
74
|
+
produces graph edges describing the relationships between discovered
|
|
75
|
+
terms.
|
|
76
|
+
- `loader` - Takes a document and loads it into the processing pipeline. Used
|
|
77
|
+
e.g. to add PDF documents.
|
|
78
|
+
- `pdf-decoder` - Takes a PDF doc and emits text extracted from the document.
|
|
79
|
+
Text extraction from PDF is not a perfect science as PDF is a printable
|
|
80
|
+
format. For instance, the wrapping of text between lines in a PDF document
|
|
81
|
+
is not semantically encoded, so the decoder will see wrapped lines as
|
|
82
|
+
space-separated.
|
|
83
|
+
- `vector-write-milvus` - Takes vector-entity mappings and records them
|
|
84
|
+
in the vector embeddings store.
|
|
85
|
+
|
|
86
|
+
## LM Specific Modules
|
|
87
|
+
|
|
88
|
+
- `llm-azure-text` - Sends request to AzureAI serverless endpoint
|
|
89
|
+
- `llm-claude-text` - Sends request to Anthropic's API
|
|
90
|
+
- `llm-ollama-text` - Sends request to LM running using Ollama
|
|
91
|
+
- `llm-vertexai-text` - Sends request to model available through VertexAI API
|
|
92
|
+
|
|
93
|
+
## Quickstart Guide
|
|
94
|
+
|
|
95
|
+
See [Quickstart on Docker Compose](docs/README.quickstart-docker-compose.md)
|
|
96
|
+
|
|
97
|
+
## Development Guide
|
|
98
|
+
|
|
99
|
+
See [Development on trustgraph](docs/README.development.md)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#!/usr/bin/env bash

# Creates the initial Pulsar Manager superuser account.
#
# Fetches a CSRF token from the Pulsar Manager service, then issues a PUT to
# the superuser endpoint, presenting the token both as a header and as a
# cookie.
#
# The base URL defaults to the local Pulsar Manager instance and can be
# overridden through the PULSAR_MANAGER_URL environment variable.

# Abort on the first failure so a failed token fetch never leads to a PUT
# being sent with an empty token.
set -euo pipefail

PULSAR_MANAGER_URL="${PULSAR_MANAGER_URL:-http://localhost:7750/pulsar-manager}"

# --fail turns HTTP error statuses into a non-zero exit; --silent/--show-error
# hide the progress meter but still report real errors.
CSRF_TOKEN=$(curl --silent --show-error --fail "${PULSAR_MANAGER_URL}/csrf-token")

curl \
    --silent --show-error --fail \
    -H "X-XSRF-TOKEN: $CSRF_TOKEN" \
    -H "Cookie: XSRF-TOKEN=$CSRF_TOKEN;" \
    -H 'Content-Type: application/json' \
    -X PUT \
    "${PULSAR_MANAGER_URL}/users/superuser" \
    -d '{"name": "admin", "password": "apachepulsar", "description": "test", "email": "username@test.org"}'
|
|
@@ -4,7 +4,7 @@ import os
|
|
|
4
4
|
with open("README.md", "r") as fh:
|
|
5
5
|
long_description = fh.read()
|
|
6
6
|
|
|
7
|
-
version = "0.2.
|
|
7
|
+
version = "0.2.4"
|
|
8
8
|
|
|
9
9
|
setuptools.setup(
|
|
10
10
|
name="trustgraph",
|
|
@@ -46,22 +46,24 @@ setuptools.setup(
|
|
|
46
46
|
],
|
|
47
47
|
scripts=[
|
|
48
48
|
"scripts/chunker-recursive",
|
|
49
|
+
"scripts/embeddings-hf",
|
|
50
|
+
"scripts/embeddings-ollama",
|
|
51
|
+
"scripts/embeddings-vectorize",
|
|
52
|
+
"scripts/graph-rag",
|
|
49
53
|
"scripts/graph-show",
|
|
50
54
|
"scripts/graph-to-turtle",
|
|
51
55
|
"scripts/graph-write-cassandra",
|
|
56
|
+
"scripts/init-pulsar-manager",
|
|
52
57
|
"scripts/kg-extract-definitions",
|
|
53
58
|
"scripts/kg-extract-relationships",
|
|
59
|
+
"scripts/llm-azure-text",
|
|
60
|
+
"scripts/llm-claude-text",
|
|
54
61
|
"scripts/llm-ollama-text",
|
|
55
62
|
"scripts/llm-vertexai-text",
|
|
56
|
-
"scripts/llm-claude-text",
|
|
57
|
-
"scripts/llm-azure-text",
|
|
58
|
-
"scripts/run-processing",
|
|
59
63
|
"scripts/loader",
|
|
60
64
|
"scripts/pdf-decoder",
|
|
61
65
|
"scripts/query",
|
|
62
|
-
"scripts/
|
|
63
|
-
"scripts/embeddings-hf",
|
|
66
|
+
"scripts/run-processing",
|
|
64
67
|
"scripts/vector-write-milvus",
|
|
65
|
-
"scripts/graph-rag",
|
|
66
68
|
]
|
|
67
69
|
)
|
|
@@ -30,6 +30,8 @@ class Processor:
|
|
|
30
30
|
output_queue=default_output_queue,
|
|
31
31
|
subscriber=default_subscriber,
|
|
32
32
|
log_level=LogLevel.INFO,
|
|
33
|
+
chunk_size=2000,
|
|
34
|
+
chunk_overlap=100,
|
|
33
35
|
):
|
|
34
36
|
|
|
35
37
|
self.client = None
|
|
@@ -50,8 +52,8 @@ class Processor:
|
|
|
50
52
|
)
|
|
51
53
|
|
|
52
54
|
self.text_splitter = RecursiveCharacterTextSplitter(
|
|
53
|
-
chunk_size=
|
|
54
|
-
chunk_overlap=
|
|
55
|
+
chunk_size=chunk_size,
|
|
56
|
+
chunk_overlap=chunk_overlap,
|
|
55
57
|
length_function=len,
|
|
56
58
|
is_separator_regex=False,
|
|
57
59
|
)
|
|
@@ -146,6 +148,20 @@ def run():
|
|
|
146
148
|
help=f'Output queue (default: info)'
|
|
147
149
|
)
|
|
148
150
|
|
|
151
|
+
parser.add_argument(
|
|
152
|
+
'-z', '--chunk-size',
|
|
153
|
+
type=int,
|
|
154
|
+
default=2000,
|
|
155
|
+
help=f'Chunk size (default: 2000)'
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
parser.add_argument(
|
|
159
|
+
'-v', '--chunk-overlap',
|
|
160
|
+
type=int,
|
|
161
|
+
default=100,
|
|
162
|
+
help=f'Chunk overlap (default: 100)'
|
|
163
|
+
)
|
|
164
|
+
|
|
149
165
|
args = parser.parse_args()
|
|
150
166
|
|
|
151
167
|
|
|
@@ -159,6 +175,8 @@ def run():
|
|
|
159
175
|
output_queue=args.output_queue,
|
|
160
176
|
subscriber=args.subscriber,
|
|
161
177
|
log_level=args.log_level,
|
|
178
|
+
chunk_size=args.chunk_size,
|
|
179
|
+
chunk_overlap=args.chunk_overlap,
|
|
162
180
|
)
|
|
163
181
|
|
|
164
182
|
p.run()
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Embeddings service, applies an embeddings model served by Ollama.
|
|
4
|
+
Input is text, output is embeddings vector.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pulsar
|
|
8
|
+
from pulsar.schema import JsonSchema
|
|
9
|
+
import tempfile
|
|
10
|
+
import base64
|
|
11
|
+
import os
|
|
12
|
+
import argparse
|
|
13
|
+
from langchain_community.embeddings import OllamaEmbeddings
|
|
14
|
+
import time
|
|
15
|
+
|
|
16
|
+
from ... schema import EmbeddingsRequest, EmbeddingsResponse
|
|
17
|
+
from ... log_level import LogLevel
|
|
18
|
+
|
|
19
|
+
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
|
|
20
|
+
default_input_queue = 'embeddings'
|
|
21
|
+
default_output_queue = 'embeddings-response'
|
|
22
|
+
default_subscriber = 'embeddings-ollama'
|
|
23
|
+
default_model="mxbai-embed-large"
|
|
24
|
+
default_ollama = 'http://localhost:11434'
|
|
25
|
+
|
|
26
|
+
class Processor:
    """Embeddings service backed by an Ollama-hosted model.

    Consumes ``EmbeddingsRequest`` messages from a Pulsar queue, embeds the
    request text with ``OllamaEmbeddings`` and publishes an
    ``EmbeddingsResponse`` carrying the resulting vector.  The sender's
    ``id`` message property is echoed on the response.
    """

    def __init__(
            self,
            pulsar_host=default_pulsar_host,
            input_queue=default_input_queue,
            output_queue=default_output_queue,
            subscriber=default_subscriber,
            log_level=LogLevel.INFO,
            model=default_model,
            ollama=default_ollama,
    ):
        """Connect to Pulsar and create the Ollama embeddings client.

        Args:
            pulsar_host: Pulsar service URL handed to ``pulsar.Client``.
            input_queue: Topic that requests are consumed from.
            output_queue: Topic that responses are published to.
            subscriber: Subscription name used on the input topic.
            log_level: ``LogLevel`` value; converted with ``to_pulsar()``
                for the Pulsar console logger.
            model: Model name passed to ``OllamaEmbeddings``.
            ollama: Base URL passed to ``OllamaEmbeddings``.
        """

        # Assigned before anything can fail so __del__ always finds it.
        self.client = None

        self.client = pulsar.Client(
            pulsar_host,
            logger=pulsar.ConsoleLogger(log_level.to_pulsar())
        )

        self.consumer = self.client.subscribe(
            input_queue, subscriber,
            schema=JsonSchema(EmbeddingsRequest),
        )

        self.producer = self.client.create_producer(
            topic=output_queue,
            schema=JsonSchema(EmbeddingsResponse),
        )

        self.embeddings = OllamaEmbeddings(base_url=ollama, model=model)

    def run(self):
        """Serve embeddings requests in an endless loop.

        Each message is acknowledged once its response has been sent.  Any
        exception raised while handling a message is printed and the
        message is negatively acknowledged; the loop then continues.
        """

        while True:

            msg = self.consumer.receive()

            try:

                request = msg.value()

                # Sender-produced ID, echoed back on the response
                msg_id = msg.properties()["id"]

                print(f"Handling input {msg_id}...", flush=True)

                # embed_query() takes one string and returns one vector.
                # Passing a list here would embed the list's repr rather
                # than the text itself.
                embeds = self.embeddings.embed_query(request.text)

                print("Send response...", flush=True)
                response = EmbeddingsResponse(vectors=[embeds])

                self.producer.send(response, properties={"id": msg_id})

                print("Done.", flush=True)

                # Acknowledge successful processing of the message
                self.consumer.acknowledge(msg)

            except Exception as e:

                print("Exception:", e, flush=True)

                # Message failed to be processed
                self.consumer.negative_acknowledge(msg)

    def __del__(self):
        """Close the Pulsar client if one was created."""

        # getattr guards against a partially constructed instance.
        client = getattr(self, "client", None)
        if client:
            client.close()
|
|
98
|
+
|
|
99
|
+
def run():
    """Command-line entry point for the ``embeddings-ollama`` service.

    Parses the command-line options, then builds a ``Processor`` and runs
    it.  If construction or processing raises, the exception is printed and
    a fresh ``Processor`` is created after a 10 second pause, indefinitely.
    """

    parser = argparse.ArgumentParser(
        prog='embeddings-ollama',
        description=__doc__,
    )

    parser.add_argument(
        '-p', '--pulsar-host',
        default=default_pulsar_host,
        help=f'Pulsar host (default: {default_pulsar_host})',
    )

    parser.add_argument(
        '-i', '--input-queue',
        default=default_input_queue,
        help=f'Input queue (default: {default_input_queue})'
    )

    parser.add_argument(
        '-s', '--subscriber',
        default=default_subscriber,
        help=f'Queue subscriber name (default: {default_subscriber})'
    )

    parser.add_argument(
        '-o', '--output-queue',
        default=default_output_queue,
        help=f'Output queue (default: {default_output_queue})'
    )

    parser.add_argument(
        '-l', '--log-level',
        type=LogLevel,
        default=LogLevel.INFO,
        choices=list(LogLevel),
        # Help text previously read "Output queue", copied from the
        # option above.
        help='Log level (default: info)'
    )

    parser.add_argument(
        '-m', '--model',
        default=default_model,
        help=f'Embeddings model (default: {default_model})'
    )

    parser.add_argument(
        '-r', '--ollama',
        default=default_ollama,
        help=f'ollama (default: {default_ollama})'
    )

    args = parser.parse_args()

    # Retry forever: a failure (e.g. in connecting or processing) results in
    # a new Processor being built after a short delay.
    while True:

        try:

            p = Processor(
                pulsar_host=args.pulsar_host,
                input_queue=args.input_queue,
                output_queue=args.output_queue,
                subscriber=args.subscriber,
                log_level=args.log_level,
                model=args.model,
                ollama=args.ollama,
            )

            p.run()

        except Exception as e:

            print("Exception:", e, flush=True)
            print("Will retry...", flush=True)

        time.sleep(10)
|
|
175
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
|
|
2
2
|
from trustgraph.trustgraph import TrustGraph
|
|
3
|
-
from trustgraph.
|
|
3
|
+
from trustgraph.triple_vectors import TripleVectors
|
|
4
4
|
from trustgraph.trustgraph import TrustGraph
|
|
5
5
|
from trustgraph.llm_client import LlmClient
|
|
6
6
|
from trustgraph.embeddings_client import EmbeddingsClient
|
|
@@ -15,7 +15,10 @@ class GraphRag:
|
|
|
15
15
|
graph_hosts=None,
|
|
16
16
|
pulsar_host="pulsar://pulsar:6650",
|
|
17
17
|
vector_store="http://milvus:19530",
|
|
18
|
-
verbose=False
|
|
18
|
+
verbose=False,
|
|
19
|
+
entity_limit=50,
|
|
20
|
+
triple_limit=30,
|
|
21
|
+
max_sg_size=3000,
|
|
19
22
|
):
|
|
20
23
|
|
|
21
24
|
self.verbose=verbose
|
|
@@ -30,11 +33,11 @@ class GraphRag:
|
|
|
30
33
|
|
|
31
34
|
self.embeddings = EmbeddingsClient(pulsar_host=pulsar_host)
|
|
32
35
|
|
|
33
|
-
self.vecstore =
|
|
36
|
+
self.vecstore = TripleVectors(vector_store)
|
|
34
37
|
|
|
35
|
-
self.entity_limit=
|
|
36
|
-
self.query_limit=
|
|
37
|
-
self.max_sg_size=
|
|
38
|
+
self.entity_limit=entity_limit
|
|
39
|
+
self.query_limit=triple_limit
|
|
40
|
+
self.max_sg_size=max_sg_size
|
|
38
41
|
|
|
39
42
|
self.label_cache = {}
|
|
40
43
|
|
|
@@ -71,6 +74,8 @@ class GraphRag:
|
|
|
71
74
|
limit=self.entity_limit
|
|
72
75
|
)
|
|
73
76
|
|
|
77
|
+
print("Obtained", len(res), "entities")
|
|
78
|
+
|
|
74
79
|
entities = set([
|
|
75
80
|
item["entity"]["entity"]
|
|
76
81
|
for item in res
|
|
@@ -142,8 +142,8 @@ def run():
|
|
|
142
142
|
|
|
143
143
|
parser.add_argument(
|
|
144
144
|
'-r', '--ollama',
|
|
145
|
-
default=
|
|
146
|
-
help=f'ollama (default:
|
|
145
|
+
default=default_ollama,
|
|
146
|
+
help=f'ollama (default: {default_ollama})'
|
|
147
147
|
)
|
|
148
148
|
|
|
149
149
|
args = parser.parse_args()
|
|
@@ -34,6 +34,9 @@ class Processor:
|
|
|
34
34
|
log_level=LogLevel.INFO,
|
|
35
35
|
graph_hosts=default_graph_hosts,
|
|
36
36
|
vector_store=default_vector_store,
|
|
37
|
+
entity_limit=50,
|
|
38
|
+
triple_limit=30,
|
|
39
|
+
max_sg_size=3000,
|
|
37
40
|
):
|
|
38
41
|
|
|
39
42
|
self.client = None
|
|
@@ -58,6 +61,9 @@ class Processor:
|
|
|
58
61
|
graph_hosts=graph_hosts,
|
|
59
62
|
vector_store=vector_store,
|
|
60
63
|
verbose=True,
|
|
64
|
+
entity_limit=entity_limit,
|
|
65
|
+
triple_limit=triple_limit,
|
|
66
|
+
max_sg_size=max_sg_size,
|
|
61
67
|
)
|
|
62
68
|
|
|
63
69
|
def run(self):
|
|
@@ -102,7 +108,7 @@ class Processor:
|
|
|
102
108
|
def run():
|
|
103
109
|
|
|
104
110
|
parser = argparse.ArgumentParser(
|
|
105
|
-
prog='
|
|
111
|
+
prog='graph-rag',
|
|
106
112
|
description=__doc__,
|
|
107
113
|
)
|
|
108
114
|
|
|
@@ -150,6 +156,27 @@ def run():
|
|
|
150
156
|
help=f'Vector host (default: http://milvus:19530)'
|
|
151
157
|
)
|
|
152
158
|
|
|
159
|
+
parser.add_argument(
|
|
160
|
+
'-e', '--entity-limit',
|
|
161
|
+
type=int,
|
|
162
|
+
default=50,
|
|
163
|
+
help=f'Entity vector fetch limit (default: 50)'
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
parser.add_argument(
|
|
167
|
+
'-t', '--triple-limit',
|
|
168
|
+
type=int,
|
|
169
|
+
default=30,
|
|
170
|
+
help=f'Triple query limit, per query (default: 30)'
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
parser.add_argument(
|
|
174
|
+
'-u', '--max-subgraph-size',
|
|
175
|
+
type=int,
|
|
176
|
+
default=3000,
|
|
177
|
+
help=f'Max subgraph size (default: 3000)'
|
|
178
|
+
)
|
|
179
|
+
|
|
153
180
|
args = parser.parse_args()
|
|
154
181
|
|
|
155
182
|
while True:
|
|
@@ -164,6 +191,9 @@ def run():
|
|
|
164
191
|
log_level=args.log_level,
|
|
165
192
|
graph_hosts=args.graph_hosts.split(","),
|
|
166
193
|
vector_store=args.vector_store,
|
|
194
|
+
entity_limit=args.entity_limit,
|
|
195
|
+
triple_limit=args.triple_limit,
|
|
196
|
+
max_sg_size=args.max_subgraph_size,
|
|
167
197
|
)
|
|
168
198
|
|
|
169
199
|
p.run()
|