trustgraph-flow 0.11.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trustgraph-flow-0.11.11/PKG-INFO +36 -0
- trustgraph-flow-0.11.11/README.md +1 -0
- trustgraph-flow-0.11.11/scripts/chunker-recursive +6 -0
- trustgraph-flow-0.11.11/scripts/chunker-token +6 -0
- trustgraph-flow-0.11.11/scripts/de-query-milvus +6 -0
- trustgraph-flow-0.11.11/scripts/de-query-qdrant +6 -0
- trustgraph-flow-0.11.11/scripts/de-write-milvus +6 -0
- trustgraph-flow-0.11.11/scripts/de-write-qdrant +6 -0
- trustgraph-flow-0.11.11/scripts/document-rag +6 -0
- trustgraph-flow-0.11.11/scripts/embeddings-ollama +6 -0
- trustgraph-flow-0.11.11/scripts/embeddings-vectorize +6 -0
- trustgraph-flow-0.11.11/scripts/ge-query-milvus +6 -0
- trustgraph-flow-0.11.11/scripts/ge-query-qdrant +6 -0
- trustgraph-flow-0.11.11/scripts/ge-write-milvus +6 -0
- trustgraph-flow-0.11.11/scripts/ge-write-qdrant +6 -0
- trustgraph-flow-0.11.11/scripts/graph-rag +6 -0
- trustgraph-flow-0.11.11/scripts/kg-extract-definitions +6 -0
- trustgraph-flow-0.11.11/scripts/kg-extract-relationships +6 -0
- trustgraph-flow-0.11.11/scripts/kg-extract-topics +6 -0
- trustgraph-flow-0.11.11/scripts/metering +5 -0
- trustgraph-flow-0.11.11/scripts/object-extract-row +6 -0
- trustgraph-flow-0.11.11/scripts/oe-write-milvus +6 -0
- trustgraph-flow-0.11.11/scripts/pdf-decoder +6 -0
- trustgraph-flow-0.11.11/scripts/prompt-generic +6 -0
- trustgraph-flow-0.11.11/scripts/prompt-template +6 -0
- trustgraph-flow-0.11.11/scripts/rows-write-cassandra +6 -0
- trustgraph-flow-0.11.11/scripts/run-processing +6 -0
- trustgraph-flow-0.11.11/scripts/text-completion-azure +6 -0
- trustgraph-flow-0.11.11/scripts/text-completion-claude +6 -0
- trustgraph-flow-0.11.11/scripts/text-completion-cohere +6 -0
- trustgraph-flow-0.11.11/scripts/text-completion-llamafile +6 -0
- trustgraph-flow-0.11.11/scripts/text-completion-ollama +6 -0
- trustgraph-flow-0.11.11/scripts/text-completion-openai +6 -0
- trustgraph-flow-0.11.11/scripts/triples-query-cassandra +6 -0
- trustgraph-flow-0.11.11/scripts/triples-query-neo4j +6 -0
- trustgraph-flow-0.11.11/scripts/triples-write-cassandra +6 -0
- trustgraph-flow-0.11.11/scripts/triples-write-neo4j +6 -0
- trustgraph-flow-0.11.11/setup.cfg +4 -0
- trustgraph-flow-0.11.11/setup.py +96 -0
- trustgraph-flow-0.11.11/trustgraph/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/chunking/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/chunking/recursive/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/chunking/recursive/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/chunking/recursive/chunker.py +108 -0
- trustgraph-flow-0.11.11/trustgraph/chunking/token/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/chunking/token/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/chunking/token/chunker.py +107 -0
- trustgraph-flow-0.11.11/trustgraph/decoding/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/decoding/pdf/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/decoding/pdf/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/decoding/pdf/pdf_decoder.py +87 -0
- trustgraph-flow-0.11.11/trustgraph/direct/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/direct/cassandra.py +108 -0
- trustgraph-flow-0.11.11/trustgraph/direct/milvus_doc_embeddings.py +138 -0
- trustgraph-flow-0.11.11/trustgraph/direct/milvus_graph_embeddings.py +138 -0
- trustgraph-flow-0.11.11/trustgraph/direct/milvus_object_embeddings.py +154 -0
- trustgraph-flow-0.11.11/trustgraph/document_rag.py +132 -0
- trustgraph-flow-0.11.11/trustgraph/embeddings/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/embeddings/ollama/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/embeddings/ollama/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/embeddings/ollama/processor.py +84 -0
- trustgraph-flow-0.11.11/trustgraph/embeddings/vectorize/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/embeddings/vectorize/__main__.py +6 -0
- trustgraph-flow-0.11.11/trustgraph/embeddings/vectorize/vectorize.py +103 -0
- trustgraph-flow-0.11.11/trustgraph/extract/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/definitions/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/definitions/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/definitions/extract.py +134 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/relationships/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/relationships/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/relationships/extract.py +208 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/topics/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/topics/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/extract/kg/topics/extract.py +134 -0
- trustgraph-flow-0.11.11/trustgraph/extract/object/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/extract/object/row/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/extract/object/row/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/extract/object/row/extract.py +220 -0
- trustgraph-flow-0.11.11/trustgraph/flow_version.py +1 -0
- trustgraph-flow-0.11.11/trustgraph/graph_rag.py +250 -0
- trustgraph-flow-0.11.11/trustgraph/metering/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/metering/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/metering/counter.py +75 -0
- trustgraph-flow-0.11.11/trustgraph/metering/pricelist.py +104 -0
- trustgraph-flow-0.11.11/trustgraph/model/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/model/prompt/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/model/prompt/generic/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/model/prompt/generic/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/model/prompt/generic/prompts.py +176 -0
- trustgraph-flow-0.11.11/trustgraph/model/prompt/generic/service.py +473 -0
- trustgraph-flow-0.11.11/trustgraph/model/prompt/template/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/model/prompt/template/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/model/prompt/template/prompts.py +47 -0
- trustgraph-flow-0.11.11/trustgraph/model/prompt/template/service.py +523 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/azure/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/azure/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/azure/llm.py +226 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/claude/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/claude/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/claude/llm.py +199 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/cohere/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/cohere/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/cohere/llm.py +179 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/llamafile/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/llamafile/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/llamafile/llm.py +209 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/ollama/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/ollama/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/ollama/llm.py +168 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/openai/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/openai/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/model/text_completion/openai/llm.py +209 -0
- trustgraph-flow-0.11.11/trustgraph/processing/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/processing/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/processing/processing.py +171 -0
- trustgraph-flow-0.11.11/trustgraph/query/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/query/doc_embeddings/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/query/doc_embeddings/milvus/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/query/doc_embeddings/milvus/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/query/doc_embeddings/milvus/service.py +106 -0
- trustgraph-flow-0.11.11/trustgraph/query/doc_embeddings/qdrant/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/query/doc_embeddings/qdrant/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/query/doc_embeddings/qdrant/service.py +117 -0
- trustgraph-flow-0.11.11/trustgraph/query/graph_embeddings/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/query/graph_embeddings/milvus/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/query/graph_embeddings/milvus/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/query/graph_embeddings/milvus/service.py +121 -0
- trustgraph-flow-0.11.11/trustgraph/query/graph_embeddings/qdrant/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/query/graph_embeddings/qdrant/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/query/graph_embeddings/qdrant/service.py +133 -0
- trustgraph-flow-0.11.11/trustgraph/query/triples/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/query/triples/cassandra/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/query/triples/cassandra/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/query/triples/cassandra/service.py +173 -0
- trustgraph-flow-0.11.11/trustgraph/query/triples/neo4j/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/query/triples/neo4j/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/query/triples/neo4j/service.py +348 -0
- trustgraph-flow-0.11.11/trustgraph/retrieval/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/retrieval/document_rag/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/retrieval/document_rag/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/retrieval/document_rag/rag.py +165 -0
- trustgraph-flow-0.11.11/trustgraph/retrieval/graph_rag/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/retrieval/graph_rag/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/retrieval/graph_rag/rag.py +217 -0
- trustgraph-flow-0.11.11/trustgraph/storage/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/storage/doc_embeddings/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/storage/doc_embeddings/milvus/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/storage/doc_embeddings/milvus/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/storage/doc_embeddings/milvus/write.py +63 -0
- trustgraph-flow-0.11.11/trustgraph/storage/doc_embeddings/qdrant/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/storage/doc_embeddings/qdrant/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/storage/doc_embeddings/qdrant/write.py +104 -0
- trustgraph-flow-0.11.11/trustgraph/storage/graph_embeddings/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/storage/graph_embeddings/milvus/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/storage/graph_embeddings/milvus/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/storage/graph_embeddings/milvus/write.py +61 -0
- trustgraph-flow-0.11.11/trustgraph/storage/graph_embeddings/qdrant/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/storage/graph_embeddings/qdrant/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/storage/graph_embeddings/qdrant/write.py +102 -0
- trustgraph-flow-0.11.11/trustgraph/storage/object_embeddings/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/storage/object_embeddings/milvus/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/storage/object_embeddings/milvus/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/storage/object_embeddings/milvus/write.py +61 -0
- trustgraph-flow-0.11.11/trustgraph/storage/rows/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/storage/rows/cassandra/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/storage/rows/cassandra/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/storage/rows/cassandra/write.py +127 -0
- trustgraph-flow-0.11.11/trustgraph/storage/triples/__init__.py +0 -0
- trustgraph-flow-0.11.11/trustgraph/storage/triples/cassandra/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/storage/triples/cassandra/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/storage/triples/cassandra/write.py +69 -0
- trustgraph-flow-0.11.11/trustgraph/storage/triples/neo4j/__init__.py +3 -0
- trustgraph-flow-0.11.11/trustgraph/storage/triples/neo4j/__main__.py +7 -0
- trustgraph-flow-0.11.11/trustgraph/storage/triples/neo4j/write.py +156 -0
- trustgraph-flow-0.11.11/trustgraph_flow.egg-info/PKG-INFO +36 -0
- trustgraph-flow-0.11.11/trustgraph_flow.egg-info/SOURCES.txt +179 -0
- trustgraph-flow-0.11.11/trustgraph_flow.egg-info/dependency_links.txt +1 -0
- trustgraph-flow-0.11.11/trustgraph_flow.egg-info/requires.txt +21 -0
- trustgraph-flow-0.11.11/trustgraph_flow.egg-info/top_level.txt +2 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: trustgraph-flow
|
3
|
+
Version: 0.11.11
|
4
|
+
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
5
|
+
Home-page: https://github.com/trustgraph-ai/trustgraph
|
6
|
+
Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.11.11.tar.gz
|
7
|
+
Author: trustgraph.ai
|
8
|
+
Author-email: security@trustgraph.ai
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
11
|
+
Classifier: Operating System :: OS Independent
|
12
|
+
Requires-Python: >=3.8
|
13
|
+
Description-Content-Type: text/markdown
|
14
|
+
Requires-Dist: trustgraph-base
|
15
|
+
Requires-Dist: urllib3
|
16
|
+
Requires-Dist: rdflib
|
17
|
+
Requires-Dist: pymilvus
|
18
|
+
Requires-Dist: langchain
|
19
|
+
Requires-Dist: langchain-core
|
20
|
+
Requires-Dist: langchain-text-splitters
|
21
|
+
Requires-Dist: langchain-community
|
22
|
+
Requires-Dist: requests
|
23
|
+
Requires-Dist: cassandra-driver
|
24
|
+
Requires-Dist: pulsar-client
|
25
|
+
Requires-Dist: pypdf
|
26
|
+
Requires-Dist: qdrant-client
|
27
|
+
Requires-Dist: tabulate
|
28
|
+
Requires-Dist: anthropic
|
29
|
+
Requires-Dist: pyyaml
|
30
|
+
Requires-Dist: prometheus-client
|
31
|
+
Requires-Dist: cohere
|
32
|
+
Requires-Dist: openai
|
33
|
+
Requires-Dist: neo4j
|
34
|
+
Requires-Dist: tiktoken
|
35
|
+
|
36
|
+
See https://trustgraph.ai/
|
@@ -0,0 +1 @@
|
|
1
|
+
See https://trustgraph.ai/
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# Packaging script for the trustgraph-flow distribution.

import setuptools
import os
import importlib
# ``import importlib`` alone does not guarantee that the ``util`` submodule
# is loaded, so import it explicitly before using importlib.util below.
import importlib.util

# The long description is the README, read verbatim.  The encoding is given
# explicitly so the result does not depend on the build machine's locale.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Load a version number module
spec = importlib.util.spec_from_file_location(
    'version', 'trustgraph/flow_version.py'
)
version_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(version_module)

version = version_module.__version__

setuptools.setup(
    name="trustgraph-flow",
    version=version,
    author="trustgraph.ai",
    author_email="security@trustgraph.ai",
    description="TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/trustgraph-ai/trustgraph",
    packages=setuptools.find_namespace_packages(
        where='./',
    ),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.8',
    # The download URL is derived from the version loaded above.
    download_url="https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz",
    install_requires=[
        "trustgraph-base",
        "urllib3",
        "rdflib",
        "pymilvus",
        "langchain",
        "langchain-core",
        "langchain-text-splitters",
        "langchain-community",
        "requests",
        "cassandra-driver",
        "pulsar-client",
        "pypdf",
        "qdrant-client",
        "tabulate",
        "anthropic",
        "pyyaml",
        "prometheus-client",
        "cohere",
        "openai",
        "neo4j",
        "tiktoken",
    ],
    scripts=[
        "scripts/chunker-recursive",
        "scripts/chunker-token",
        "scripts/de-query-milvus",
        "scripts/de-query-qdrant",
        "scripts/de-write-milvus",
        "scripts/de-write-qdrant",
        "scripts/document-rag",
        "scripts/embeddings-ollama",
        "scripts/embeddings-vectorize",
        "scripts/ge-query-milvus",
        "scripts/ge-query-qdrant",
        "scripts/ge-write-milvus",
        "scripts/ge-write-qdrant",
        "scripts/graph-rag",
        "scripts/kg-extract-definitions",
        "scripts/kg-extract-topics",
        "scripts/kg-extract-relationships",
        "scripts/metering",
        "scripts/object-extract-row",
        "scripts/oe-write-milvus",
        "scripts/pdf-decoder",
        "scripts/prompt-generic",
        "scripts/prompt-template",
        "scripts/rows-write-cassandra",
        "scripts/run-processing",
        "scripts/text-completion-azure",
        "scripts/text-completion-claude",
        "scripts/text-completion-cohere",
        "scripts/text-completion-llamafile",
        "scripts/text-completion-ollama",
        "scripts/text-completion-openai",
        "scripts/triples-query-cassandra",
        "scripts/triples-query-neo4j",
        "scripts/triples-write-cassandra",
        "scripts/triples-write-neo4j",
    ]
)
|
File without changes
|
File without changes
|
@@ -0,0 +1,108 @@
|
|
1
|
+
|
2
|
+
"""
|
3
|
+
Simple chunker, accepts text documents on input, outputs chunks of the
|
4
|
+
text as separate output objects.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
8
|
+
from prometheus_client import Histogram
|
9
|
+
|
10
|
+
from ... schema import TextDocument, Chunk, Source
|
11
|
+
from ... schema import text_ingest_queue, chunk_ingest_queue
|
12
|
+
from ... log_level import LogLevel
|
13
|
+
from ... base import ConsumerProducer
|
14
|
+
|
15
|
+
# Dotted module path with the first and last components of __name__ removed.
module = ".".join(__name__.split(".")[1:-1])

# Queue defaults come from the schema package; the subscriber name defaults
# to the module path computed above.
default_input_queue, default_output_queue = (
    text_ingest_queue, chunk_ingest_queue,
)
default_subscriber = module
|
20
|
+
|
21
|
+
class Processor(ConsumerProducer):
    """
    Consumer/producer which splits text documents into chunks.

    Each incoming TextDocument is split with a
    RecursiveCharacterTextSplitter and every resulting piece is sent as a
    separate Chunk message.
    """

    # Single source of truth for the defaults used by both the constructor
    # and the command-line parser, so the two cannot drift apart.
    DEFAULT_CHUNK_SIZE = 2000
    DEFAULT_CHUNK_OVERLAP = 100

    def __init__(self, **params):
        """
        Configure queues, the chunk-size histogram and the text splitter.

        Optional keys read from ``params``: ``input_queue``,
        ``output_queue``, ``subscriber``, ``chunk_size`` and
        ``chunk_overlap``.  The whole of ``params`` is also forwarded to
        the base-class constructor.
        """

        input_queue = params.get("input_queue", default_input_queue)
        output_queue = params.get("output_queue", default_output_queue)
        subscriber = params.get("subscriber", default_subscriber)
        chunk_size = params.get("chunk_size", self.DEFAULT_CHUNK_SIZE)
        chunk_overlap = params.get(
            "chunk_overlap", self.DEFAULT_CHUNK_OVERLAP
        )

        # Merge with dict unpacking rather than the ``|`` operator: ``|`` on
        # dicts needs Python 3.9, while the package declares support for
        # Python >= 3.8.  Later keys still override those in ``params``.
        super().__init__(
            **{
                **params,
                "input_queue": input_queue,
                "output_queue": output_queue,
                "subscriber": subscriber,
                "input_schema": TextDocument,
                "output_schema": Chunk,
            }
        )

        # The histogram is stored on the class and only created the first
        # time a Processor is instantiated.
        if not hasattr(__class__, "chunk_metric"):
            __class__.chunk_metric = Histogram(
                'chunk_size', 'Chunk size',
                buckets=[100, 160, 250, 400, 650, 1000, 1600,
                         2500, 4000, 6400, 10000, 16000]
            )

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    def handle(self, msg):
        """
        Split the document carried by ``msg`` and send one Chunk per piece.

        Chunk ids are the source document id followed by ``-c`` and the
        zero-based index of the chunk.
        """

        v = msg.value()
        print(f"Chunking {v.source.id}...", flush=True)

        texts = self.text_splitter.create_documents(
            [v.text.decode("utf-8")]
        )

        for ix, chunk in enumerate(texts):

            # Named chunk_id so the builtin ``id`` is not shadowed.
            chunk_id = v.source.id + "-c" + str(ix)

            r = Chunk(
                source=Source(
                    source=v.source.source,
                    id=chunk_id,
                    title=v.source.title
                ),
                chunk=chunk.page_content.encode("utf-8"),
            )

            # Observed size is the length of the chunk text before encoding.
            __class__.chunk_metric.observe(len(chunk.page_content))

            self.send(r)

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """
        Add the base-class arguments plus ``--chunk-size`` and
        ``--chunk-overlap`` to ``parser``.
        """

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '-z', '--chunk-size',
            type=int,
            default=Processor.DEFAULT_CHUNK_SIZE,
            help=f'Chunk size (default: {Processor.DEFAULT_CHUNK_SIZE})'
        )

        parser.add_argument(
            '-v', '--chunk-overlap',
            type=int,
            default=Processor.DEFAULT_CHUNK_OVERLAP,
            help=f'Chunk overlap (default: {Processor.DEFAULT_CHUNK_OVERLAP})'
        )
|
104
|
+
|
105
|
+
def run():
    """Start the processor, passing the module name and module docstring."""

    service_name, description = module, __doc__
    Processor.start(service_name, description)
|
108
|
+
|
@@ -0,0 +1,107 @@
|
|
1
|
+
|
2
|
+
"""
|
3
|
+
Simple chunker, accepts text documents on input, outputs chunks of the
|
4
|
+
text as separate output objects.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from langchain_text_splitters import TokenTextSplitter
|
8
|
+
from prometheus_client import Histogram
|
9
|
+
|
10
|
+
from ... schema import TextDocument, Chunk, Source
|
11
|
+
from ... schema import text_ingest_queue, chunk_ingest_queue
|
12
|
+
from ... log_level import LogLevel
|
13
|
+
from ... base import ConsumerProducer
|
14
|
+
|
15
|
+
# Dotted module path with the first and last components of __name__ removed.
module = ".".join(__name__.split(".")[1:-1])

# Queue defaults come from the schema package; the subscriber name defaults
# to the module path computed above.
default_input_queue, default_output_queue = (
    text_ingest_queue, chunk_ingest_queue,
)
default_subscriber = module
|
20
|
+
|
21
|
+
class Processor(ConsumerProducer):
    """
    Consumer/producer which splits text documents into chunks.

    Each incoming TextDocument is split with a TokenTextSplitter using the
    ``cl100k_base`` encoding and every resulting piece is sent as a
    separate Chunk message.
    """

    # Single source of truth for the defaults used by both the constructor
    # and the command-line parser, so the two cannot drift apart.
    DEFAULT_CHUNK_SIZE = 250
    DEFAULT_CHUNK_OVERLAP = 15

    def __init__(self, **params):
        """
        Configure queues, the chunk-size histogram and the text splitter.

        Optional keys read from ``params``: ``input_queue``,
        ``output_queue``, ``subscriber``, ``chunk_size`` and
        ``chunk_overlap``.  The whole of ``params`` is also forwarded to
        the base-class constructor.
        """

        input_queue = params.get("input_queue", default_input_queue)
        output_queue = params.get("output_queue", default_output_queue)
        subscriber = params.get("subscriber", default_subscriber)
        chunk_size = params.get("chunk_size", self.DEFAULT_CHUNK_SIZE)
        chunk_overlap = params.get(
            "chunk_overlap", self.DEFAULT_CHUNK_OVERLAP
        )

        # Merge with dict unpacking rather than the ``|`` operator: ``|`` on
        # dicts needs Python 3.9, while the package declares support for
        # Python >= 3.8.  Later keys still override those in ``params``.
        super().__init__(
            **{
                **params,
                "input_queue": input_queue,
                "output_queue": output_queue,
                "subscriber": subscriber,
                "input_schema": TextDocument,
                "output_schema": Chunk,
            }
        )

        # The histogram is stored on the class and only created the first
        # time a Processor is instantiated.
        if not hasattr(__class__, "chunk_metric"):
            __class__.chunk_metric = Histogram(
                'chunk_size', 'Chunk size',
                buckets=[100, 160, 250, 400, 650, 1000, 1600,
                         2500, 4000, 6400, 10000, 16000]
            )

        self.text_splitter = TokenTextSplitter(
            encoding_name="cl100k_base",
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def handle(self, msg):
        """
        Split the document carried by ``msg`` and send one Chunk per piece.

        Chunk ids are the source document id followed by ``-c`` and the
        zero-based index of the chunk.
        """

        v = msg.value()
        print(f"Chunking {v.source.id}...", flush=True)

        texts = self.text_splitter.create_documents(
            [v.text.decode("utf-8")]
        )

        for ix, chunk in enumerate(texts):

            # Named chunk_id so the builtin ``id`` is not shadowed.
            chunk_id = v.source.id + "-c" + str(ix)

            r = Chunk(
                source=Source(
                    source=v.source.source,
                    id=chunk_id,
                    title=v.source.title
                ),
                chunk=chunk.page_content.encode("utf-8"),
            )

            # The metric records the length of the chunk text (len() of the
            # string), not the number of tokens in it.
            __class__.chunk_metric.observe(len(chunk.page_content))

            self.send(r)

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """
        Add the base-class arguments plus ``--chunk-size`` and
        ``--chunk-overlap`` to ``parser``.
        """

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '-z', '--chunk-size',
            type=int,
            default=Processor.DEFAULT_CHUNK_SIZE,
            help=f'Chunk size (default: {Processor.DEFAULT_CHUNK_SIZE})'
        )

        parser.add_argument(
            '-v', '--chunk-overlap',
            type=int,
            default=Processor.DEFAULT_CHUNK_OVERLAP,
            help=f'Chunk overlap (default: {Processor.DEFAULT_CHUNK_OVERLAP})'
        )
|
103
|
+
|
104
|
+
def run():
    """Start the processor, passing the module name and module docstring."""

    service_name, description = module, __doc__
    Processor.start(service_name, description)
|
107
|
+
|