trustgraph 0.5.2__tar.gz → 0.5.5__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of trustgraph might be problematic. Click here for more details.

Files changed (119)
  1. {trustgraph-0.5.2 → trustgraph-0.5.5}/PKG-INFO +2 -2
  2. trustgraph-0.5.5/scripts/concat-parquet +45 -0
  3. trustgraph-0.5.5/scripts/dump-parquet +24 -0
  4. trustgraph-0.5.5/scripts/ge-dump-parquet +6 -0
  5. trustgraph-0.5.5/scripts/load-graph-embeddings +145 -0
  6. trustgraph-0.5.5/scripts/load-triples +144 -0
  7. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/loader +2 -2
  8. {trustgraph-0.5.2 → trustgraph-0.5.5}/setup.py +7 -3
  9. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/base/base_processor.py +15 -13
  10. trustgraph-0.5.5/trustgraph/dump/graph_embeddings/parquet/processor.py +87 -0
  11. trustgraph-0.5.5/trustgraph/dump/graph_embeddings/parquet/writer.py +94 -0
  12. trustgraph-0.5.5/trustgraph/embeddings/ollama/__init__.py +3 -0
  13. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/embeddings/vectorize/vectorize.py +1 -1
  14. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/kg/extract_definitions/extract.py +3 -0
  15. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/kg/extract_relationships/extract.py +4 -0
  16. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/retrieval/graph_rag/rag.py +1 -0
  17. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/schema.py +28 -13
  18. trustgraph-0.5.5/trustgraph/storage/triples/__init__.py +0 -0
  19. trustgraph-0.5.5/trustgraph/storage/triples/cassandra/__main__.py +7 -0
  20. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph.egg-info/PKG-INFO +2 -2
  21. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph.egg-info/SOURCES.txt +9 -0
  22. trustgraph-0.5.2/scripts/dump-parquet +0 -12
  23. {trustgraph-0.5.2 → trustgraph-0.5.5}/LICENSE +0 -0
  24. {trustgraph-0.5.2 → trustgraph-0.5.5}/README.md +0 -0
  25. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/chunker-recursive +0 -0
  26. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/embeddings-hf +0 -0
  27. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/embeddings-ollama +0 -0
  28. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/embeddings-vectorize +0 -0
  29. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/ge-write-milvus +0 -0
  30. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/graph-rag +0 -0
  31. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/graph-show +0 -0
  32. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/graph-to-turtle +0 -0
  33. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/init-pulsar-manager +0 -0
  34. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/kg-extract-definitions +0 -0
  35. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/kg-extract-relationships +0 -0
  36. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/pdf-decoder +0 -0
  37. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/query +0 -0
  38. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/run-processing +0 -0
  39. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/text-completion-azure +0 -0
  40. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/text-completion-claude +0 -0
  41. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/text-completion-ollama +0 -0
  42. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/text-completion-vertexai +0 -0
  43. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/triples-dump-parquet +0 -0
  44. {trustgraph-0.5.2 → trustgraph-0.5.5}/scripts/triples-write-cassandra +0 -0
  45. {trustgraph-0.5.2 → trustgraph-0.5.5}/setup.cfg +0 -0
  46. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/__init__.py +0 -0
  47. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/base/__init__.py +0 -0
  48. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/base/consumer.py +0 -0
  49. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/base/consumer_producer.py +0 -0
  50. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/base/producer.py +0 -0
  51. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/chunking/__init__.py +0 -0
  52. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/chunking/recursive/__init__.py +0 -0
  53. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/chunking/recursive/__main__.py +0 -0
  54. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/chunking/recursive/chunker.py +0 -0
  55. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/decoding/__init__.py +0 -0
  56. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/decoding/pdf/__init__.py +0 -0
  57. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/decoding/pdf/__main__.py +0 -0
  58. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/decoding/pdf/pdf_decoder.py +0 -0
  59. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/dump/__init__.py +0 -0
  60. {trustgraph-0.5.2/trustgraph/dump/triples → trustgraph-0.5.5/trustgraph/dump/graph_embeddings}/__init__.py +0 -0
  61. {trustgraph-0.5.2/trustgraph/dump/triples → trustgraph-0.5.5/trustgraph/dump/graph_embeddings}/parquet/__init__.py +0 -0
  62. {trustgraph-0.5.2/trustgraph/dump/triples → trustgraph-0.5.5/trustgraph/dump/graph_embeddings}/parquet/__main__.py +0 -0
  63. {trustgraph-0.5.2/trustgraph/embeddings → trustgraph-0.5.5/trustgraph/dump/triples}/__init__.py +0 -0
  64. {trustgraph-0.5.2/trustgraph/embeddings/ollama → trustgraph-0.5.5/trustgraph/dump/triples/parquet}/__init__.py +0 -0
  65. {trustgraph-0.5.2/trustgraph/storage/graph_embeddings/milvus → trustgraph-0.5.5/trustgraph/dump/triples/parquet}/__main__.py +0 -0
  66. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/dump/triples/parquet/processor.py +0 -0
  67. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/dump/triples/parquet/writer.py +0 -0
  68. {trustgraph-0.5.2/trustgraph/kg → trustgraph-0.5.5/trustgraph/embeddings}/__init__.py +0 -0
  69. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/embeddings/hf/__init__.py +0 -0
  70. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/embeddings/hf/__main__.py +0 -0
  71. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/embeddings/hf/hf.py +0 -0
  72. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/embeddings/ollama/__main__.py +0 -0
  73. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/embeddings/ollama/processor.py +0 -0
  74. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/embeddings/vectorize/__init__.py +0 -0
  75. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/embeddings/vectorize/__main__.py +0 -0
  76. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/embeddings_client.py +0 -0
  77. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/graph_rag.py +0 -0
  78. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/graph_rag_client.py +0 -0
  79. {trustgraph-0.5.2/trustgraph/model → trustgraph-0.5.5/trustgraph/kg}/__init__.py +0 -0
  80. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/kg/extract_definitions/__init__.py +0 -0
  81. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/kg/extract_definitions/__main__.py +0 -0
  82. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/kg/extract_relationships/__init__.py +0 -0
  83. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/kg/extract_relationships/__main__.py +0 -0
  84. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/llm_client.py +0 -0
  85. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/log_level.py +0 -0
  86. {trustgraph-0.5.2/trustgraph/model/text_completion → trustgraph-0.5.5/trustgraph/model}/__init__.py +0 -0
  87. {trustgraph-0.5.2/trustgraph/retrieval → trustgraph-0.5.5/trustgraph/model/text_completion}/__init__.py +0 -0
  88. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/azure/__init__.py +0 -0
  89. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/azure/__main__.py +0 -0
  90. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/azure/llm.py +0 -0
  91. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/claude/__init__.py +0 -0
  92. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/claude/__main__.py +0 -0
  93. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/claude/llm.py +0 -0
  94. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/ollama/__init__.py +0 -0
  95. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/ollama/__main__.py +0 -0
  96. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/ollama/llm.py +0 -0
  97. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/vertexai/__init__.py +0 -0
  98. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/vertexai/__main__.py +0 -0
  99. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/model/text_completion/vertexai/llm.py +0 -0
  100. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/processing/__init__.py +0 -0
  101. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/processing/__main__.py +0 -0
  102. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/processing/processing.py +0 -0
  103. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/prompts.py +0 -0
  104. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/rdf.py +0 -0
  105. {trustgraph-0.5.2/trustgraph/storage → trustgraph-0.5.5/trustgraph/retrieval}/__init__.py +0 -0
  106. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/retrieval/graph_rag/__init__.py +0 -0
  107. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/retrieval/graph_rag/__main__.py +0 -0
  108. {trustgraph-0.5.2/trustgraph/storage/graph_embeddings → trustgraph-0.5.5/trustgraph/storage}/__init__.py +0 -0
  109. {trustgraph-0.5.2/trustgraph/storage/triples → trustgraph-0.5.5/trustgraph/storage/graph_embeddings}/__init__.py +0 -0
  110. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/storage/graph_embeddings/milvus/__init__.py +0 -0
  111. {trustgraph-0.5.2/trustgraph/storage/triples/cassandra → trustgraph-0.5.5/trustgraph/storage/graph_embeddings/milvus}/__main__.py +0 -0
  112. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/storage/graph_embeddings/milvus/write.py +0 -0
  113. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/storage/triples/cassandra/__init__.py +0 -0
  114. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/storage/triples/cassandra/write.py +0 -0
  115. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/triple_vectors.py +0 -0
  116. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph/trustgraph.py +0 -0
  117. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph.egg-info/dependency_links.txt +0 -0
  118. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph.egg-info/requires.txt +0 -0
  119. {trustgraph-0.5.2 → trustgraph-0.5.5}/trustgraph.egg-info/top_level.txt +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: trustgraph
3
- Version: 0.5.2
3
+ Version: 0.5.5
4
4
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
5
  Home-page: https://github.com/trustgraph-ai/trustgraph
6
- Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.5.2.tar.gz
6
+ Download-URL: https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v0.5.5.tar.gz
7
7
  Author: trustgraph.ai
8
8
  Author-email: security@trustgraph.ai
9
9
  Classifier: Programming Language :: Python :: 3
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Concatenates multiple parquet files into a single parquet output
5
+ """
6
+
7
+ import pyarrow as pa
8
+ import pyarrow.parquet as pq
9
+ import pandas as pd
10
+ import sys
11
+ import argparse
12
+
13
+ parser = argparse.ArgumentParser(
14
+ prog="combine-parquet",
15
+ description=__doc__
16
+ )
17
+
18
+ parser.add_argument(
19
+ '-i', '--input',
20
+ nargs='*',
21
+ help=f'Input files'
22
+ )
23
+
24
+ parser.add_argument(
25
+ '-o', '--output',
26
+ help=f'Output files'
27
+ )
28
+
29
+ args = parser.parse_args()
30
+
31
+ df = None
32
+
33
+ for file in args.input:
34
+
35
+ part = pq.read_table(file).to_pandas()
36
+
37
+ if df is None:
38
+ df = part
39
+ else:
40
+ df = pd.concat([df, part], ignore_index=True)
41
+
42
+ if df is not None:
43
+
44
+ table = pa.Table.from_pandas(df)
45
+ pq.write_table(table, args.output)
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import pyarrow as pa
4
+ import pyarrow.csv as pc
5
+ import pyarrow.parquet as pq
6
+ import pandas as pd
7
+ import sys
8
+
9
+ df = None
10
+
11
+ for file in sys.argv[1:]:
12
+
13
+ part = pq.read_table(file).to_pandas()
14
+
15
+ if df is None:
16
+ df = part
17
+ else:
18
+ df = pd.concat([df, part], ignore_index=True)
19
+
20
+ if df is not None:
21
+
22
+ table = pa.Table.from_pandas(df)
23
+ pc.write_csv(table, sys.stdout.buffer)
24
+
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from trustgraph.dump.graph_embeddings.parquet import run
4
+
5
+ run()
6
+
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Loads Graph embeddings into TrustGraph processing.
5
+ """
6
+
7
+ import pulsar
8
+ from pulsar.schema import JsonSchema
9
+ from trustgraph.schema import GraphEmbeddings, Value
10
+ from trustgraph.schema import graph_embeddings_store_queue
11
+ import argparse
12
+ import os
13
+ import time
14
+ import pyarrow as pa
15
+ import pyarrow.parquet as pq
16
+
17
+ from trustgraph.log_level import LogLevel
18
+
19
+ class Loader:
20
+
21
+ def __init__(
22
+ self,
23
+ pulsar_host,
24
+ output_queue,
25
+ log_level,
26
+ file,
27
+ ):
28
+
29
+ self.client = pulsar.Client(
30
+ pulsar_host,
31
+ logger=pulsar.ConsoleLogger(log_level.to_pulsar())
32
+ )
33
+
34
+ self.producer = self.client.create_producer(
35
+ topic=output_queue,
36
+ schema=JsonSchema(GraphEmbeddings),
37
+ chunking_enabled=True,
38
+ )
39
+
40
+ self.file = file
41
+
42
+ def run(self):
43
+
44
+ try:
45
+
46
+ path = self.file
47
+
48
+ print("Reading file...")
49
+ table = pq.read_table(path)
50
+ print("Loaded.")
51
+
52
+ names = set(table.column_names)
53
+
54
+ if "embeddings" not in names:
55
+ print("No 'embeddings' column")
56
+
57
+ if "entity" not in names:
58
+ print("No 'entity' column")
59
+
60
+ embc = table.column("embeddings")
61
+ entc = table.column("entity")
62
+
63
+ for emb, ent in zip(embc, entc):
64
+
65
+ b = emb.as_py()
66
+ n = ent.as_py()
67
+
68
+ r = GraphEmbeddings(
69
+ vectors=b,
70
+ entity=Value(
71
+ value=n,
72
+ is_uri=n.startswith("https:")
73
+ )
74
+ )
75
+
76
+ self.producer.send(r)
77
+
78
+ except Exception as e:
79
+ print(e, flush=True)
80
+
81
+ def __del__(self):
82
+ self.client.close()
83
+
84
+ def main():
85
+
86
+ parser = argparse.ArgumentParser(
87
+ prog='loader',
88
+ description=__doc__,
89
+ )
90
+
91
+ default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
92
+ default_output_queue = graph_embeddings_store_queue
93
+
94
+ parser.add_argument(
95
+ '-p', '--pulsar-host',
96
+ default=default_pulsar_host,
97
+ help=f'Pulsar host (default: {default_pulsar_host})',
98
+ )
99
+
100
+ parser.add_argument(
101
+ '-o', '--output-queue',
102
+ default=default_output_queue,
103
+ help=f'Output queue (default: {default_output_queue})'
104
+ )
105
+
106
+ parser.add_argument(
107
+ '-l', '--log-level',
108
+ type=LogLevel,
109
+ default=LogLevel.ERROR,
110
+ choices=list(LogLevel),
111
+ help=f'Output queue (default: info)'
112
+ )
113
+
114
+ parser.add_argument(
115
+ '-f', '--file',
116
+ required=True,
117
+ help=f'File to load'
118
+ )
119
+
120
+ args = parser.parse_args()
121
+
122
+ while True:
123
+
124
+ try:
125
+ p = Loader(
126
+ pulsar_host=args.pulsar_host,
127
+ output_queue=args.output_queue,
128
+ log_level=args.log_level,
129
+ file=args.file,
130
+ )
131
+
132
+ p.run()
133
+
134
+ print("File loaded.")
135
+ break
136
+
137
+ except Exception as e:
138
+
139
+ print("Exception:", e, flush=True)
140
+ print("Will retry...", flush=True)
141
+
142
+ time.sleep(10)
143
+
144
+ main()
145
+
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Loads Graph embeddings into TrustGraph processing.
5
+ """
6
+
7
+ import pulsar
8
+ from pulsar.schema import JsonSchema
9
+ from trustgraph.schema import Triple, Value
10
+ from trustgraph.schema import triples_store_queue
11
+ import argparse
12
+ import os
13
+ import time
14
+ import pyarrow as pa
15
+ import pyarrow.parquet as pq
16
+
17
+ from trustgraph.log_level import LogLevel
18
+
19
+ class Loader:
20
+
21
+ def __init__(
22
+ self,
23
+ pulsar_host,
24
+ output_queue,
25
+ log_level,
26
+ file,
27
+ ):
28
+
29
+ self.client = pulsar.Client(
30
+ pulsar_host,
31
+ logger=pulsar.ConsoleLogger(log_level.to_pulsar())
32
+ )
33
+
34
+ self.producer = self.client.create_producer(
35
+ topic=output_queue,
36
+ schema=JsonSchema(Triple),
37
+ chunking_enabled=True,
38
+ )
39
+
40
+ self.file = file
41
+
42
+ def run(self):
43
+
44
+ try:
45
+
46
+ path = self.file
47
+
48
+ print("Reading file...")
49
+ table = pq.read_table(path)
50
+ print("Loaded.")
51
+
52
+ names = set(table.column_names)
53
+
54
+ if "s" not in names:
55
+ print("No 's' column")
56
+
57
+ if "p" not in names:
58
+ print("No 'p' column")
59
+
60
+ if "o" not in names:
61
+ print("No 'o' column")
62
+
63
+ sc = table.column("s")
64
+ pc = table.column("p")
65
+ oc = table.column("o")
66
+
67
+ for s, p, o in zip(sc, pc, oc):
68
+
69
+ r = Triple(
70
+ s=Value(value=s.as_py(), is_uri=True),
71
+ p=Value(value=p.as_py(), is_uri=True),
72
+ o=Value(value=o.as_py(), is_uri=o.as_py().startswith("https:"))
73
+ )
74
+
75
+ self.producer.send(r)
76
+
77
+ except Exception as e:
78
+ print(e, flush=True)
79
+
80
+ def __del__(self):
81
+ self.client.close()
82
+
83
+ def main():
84
+
85
+ parser = argparse.ArgumentParser(
86
+ prog='loader',
87
+ description=__doc__,
88
+ )
89
+
90
+ default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
91
+ default_output_queue = triples_store_queue
92
+
93
+ parser.add_argument(
94
+ '-p', '--pulsar-host',
95
+ default=default_pulsar_host,
96
+ help=f'Pulsar host (default: {default_pulsar_host})',
97
+ )
98
+
99
+ parser.add_argument(
100
+ '-o', '--output-queue',
101
+ default=default_output_queue,
102
+ help=f'Output queue (default: {default_output_queue})'
103
+ )
104
+
105
+ parser.add_argument(
106
+ '-l', '--log-level',
107
+ type=LogLevel,
108
+ default=LogLevel.ERROR,
109
+ choices=list(LogLevel),
110
+ help=f'Output queue (default: info)'
111
+ )
112
+
113
+ parser.add_argument(
114
+ '-f', '--file',
115
+ required=True,
116
+ help=f'File to load'
117
+ )
118
+
119
+ args = parser.parse_args()
120
+
121
+ while True:
122
+
123
+ try:
124
+ p = Loader(
125
+ pulsar_host=args.pulsar_host,
126
+ output_queue=args.output_queue,
127
+ log_level=args.log_level,
128
+ file=args.file,
129
+ )
130
+
131
+ p.run()
132
+
133
+ print("File loaded.")
134
+ break
135
+
136
+ except Exception as e:
137
+
138
+ print("Exception:", e, flush=True)
139
+ print("Will retry...", flush=True)
140
+
141
+ time.sleep(10)
142
+
143
+ main()
144
+
@@ -6,7 +6,7 @@ Loads a PDF documented into TrustGraph processing.
6
6
 
7
7
  import pulsar
8
8
  from pulsar.schema import JsonSchema
9
- from trustgraph.schema import Document, Source
9
+ from trustgraph.schema import Document, Source, document_ingest_queue
10
10
  import base64
11
11
  import hashlib
12
12
  import argparse
@@ -72,7 +72,7 @@ def main():
72
72
  )
73
73
 
74
74
  default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
75
- default_output_queue = 'document-load'
75
+ default_output_queue = document_ingest_queue
76
76
 
77
77
  parser.add_argument(
78
78
  '-p', '--pulsar-host',
@@ -4,7 +4,7 @@ import os
4
4
  with open("README.md", "r") as fh:
5
5
  long_description = fh.read()
6
6
 
7
- version = "0.5.2"
7
+ version = "0.5.5"
8
8
 
9
9
  setuptools.setup(
10
10
  name="trustgraph",
@@ -48,9 +48,12 @@ setuptools.setup(
48
48
  ],
49
49
  scripts=[
50
50
  "scripts/chunker-recursive",
51
+ "scripts/concat-parquet",
52
+ "scripts/dump-parquet",
51
53
  "scripts/embeddings-hf",
52
54
  "scripts/embeddings-ollama",
53
55
  "scripts/embeddings-vectorize",
56
+ "scripts/ge-dump-parquet",
54
57
  "scripts/ge-write-milvus",
55
58
  "scripts/graph-rag",
56
59
  "scripts/graph-show",
@@ -58,6 +61,8 @@ setuptools.setup(
58
61
  "scripts/init-pulsar-manager",
59
62
  "scripts/kg-extract-definitions",
60
63
  "scripts/kg-extract-relationships",
64
+ "scripts/load-graph-embeddings",
65
+ "scripts/load-triples",
61
66
  "scripts/loader",
62
67
  "scripts/pdf-decoder",
63
68
  "scripts/query",
@@ -66,8 +71,7 @@ setuptools.setup(
66
71
  "scripts/text-completion-claude",
67
72
  "scripts/text-completion-ollama",
68
73
  "scripts/text-completion-vertexai",
69
- "scripts/triples-write-cassandra",
70
- "scripts/dump-parquet",
71
74
  "scripts/triples-dump-parquet",
75
+ "scripts/triples-write-cassandra",
72
76
  ]
73
77
  )
@@ -60,10 +60,10 @@ class BaseProcessor:
60
60
  )
61
61
 
62
62
  parser.add_argument(
63
- '-M', '--metrics-enabled',
64
- type=bool,
63
+ '--metrics',
64
+ action=argparse.BooleanOptionalAction,
65
65
  default=True,
66
- help=f'Pulsar host (default: true)',
66
+ help=f'Metrics enabled (default: true)',
67
67
  )
68
68
 
69
69
  parser.add_argument(
@@ -79,20 +79,22 @@ class BaseProcessor:
79
79
  @classmethod
80
80
  def start(cls, prog, doc):
81
81
 
82
- while True:
82
+ parser = argparse.ArgumentParser(
83
+ prog=prog,
84
+ description=doc
85
+ )
83
86
 
84
- parser = argparse.ArgumentParser(
85
- prog=prog,
86
- description=doc
87
- )
87
+ cls.add_args(parser)
88
88
 
89
- cls.add_args(parser)
89
+ args = parser.parse_args()
90
+ args = vars(args)
90
91
 
91
- args = parser.parse_args()
92
- args = vars(args)
92
+ print(args)
93
93
 
94
- if args["metrics_enabled"]:
95
- start_http_server(args["metrics_port"])
94
+ if args["metrics"]:
95
+ start_http_server(args["metrics_port"])
96
+
97
+ while True:
96
98
 
97
99
  try:
98
100
 
@@ -0,0 +1,87 @@
1
+
2
+ """
3
+ Write graph embeddings to parquet files in a directory.
4
+ """
5
+
6
+ import pulsar
7
+ import base64
8
+ import os
9
+ import argparse
10
+ import time
11
+
12
+ from .... trustgraph import TrustGraph
13
+ from .... schema import GraphEmbeddings
14
+ from .... schema import graph_embeddings_store_queue
15
+ from .... log_level import LogLevel
16
+ from .... base import Consumer
17
+
18
+ from . writer import ParquetWriter
19
+
20
+ module = ".".join(__name__.split(".")[1:-1])
21
+
22
+ default_input_queue = graph_embeddings_store_queue
23
+ default_subscriber = module
24
+ default_graph_host='localhost'
25
+ default_directory = "."
26
+ default_file_template = "graph-embeds-{id}.parquet"
27
+ default_rotation_time = 60
28
+
29
+ class Processor(Consumer):
30
+
31
+ def __init__(self, **params):
32
+
33
+ input_queue = params.get("input_queue", default_input_queue)
34
+ subscriber = params.get("subscriber", default_subscriber)
35
+ directory = params.get("directory", default_directory)
36
+ file_template = params.get("file_template", default_file_template)
37
+ rotation_time = params.get("rotation_time", default_rotation_time)
38
+
39
+ super(Processor, self).__init__(
40
+ **params | {
41
+ "input_queue": input_queue,
42
+ "subscriber": subscriber,
43
+ "input_schema": GraphEmbeddings,
44
+ }
45
+ )
46
+
47
+ self.writer = ParquetWriter(directory, file_template, rotation_time)
48
+
49
+ def __del__(self):
50
+ if hasattr(self, "writer"):
51
+ del self.writer
52
+
53
+ def handle(self, msg):
54
+
55
+ v = msg.value()
56
+ self.writer.write(v.vectors, v.entity.value)
57
+
58
+ @staticmethod
59
+ def add_args(parser):
60
+
61
+ Consumer.add_args(
62
+ parser, default_input_queue, default_subscriber,
63
+ )
64
+
65
+ parser.add_argument(
66
+ '-d', '--directory',
67
+ default=default_directory,
68
+ help=f'Directory to write to (default: {default_directory})'
69
+ )
70
+
71
+ parser.add_argument(
72
+ '-f', '--file-template',
73
+ default=default_file_template,
74
+ help=f'Directory to write to (default: {default_file_template})'
75
+ )
76
+
77
+ parser.add_argument(
78
+ '-t', '--rotation-time',
79
+ type=int,
80
+ default=default_rotation_time,
81
+ help=f'Rotation time / seconds (default: {default_rotation_time})'
82
+ )
83
+
84
+ def run():
85
+
86
+ Processor.start(module, __doc__)
87
+
@@ -0,0 +1,94 @@
1
+
2
+ import threading
3
+ import queue
4
+ import time
5
+ import uuid
6
+ import pyarrow as pa
7
+ import pyarrow.parquet as pq
8
+
9
+ class ParquetWriter:
10
+
11
+ def __init__(self, directory, file_template, rotation_time):
12
+ self.directory = directory
13
+ self.file_template = file_template
14
+ self.rotation_time = rotation_time
15
+
16
+ self.q = queue.Queue()
17
+
18
+ self.running = True
19
+
20
+ self.thread = threading.Thread(target=(self.writer_thread))
21
+ self.thread.start()
22
+
23
+ def writer_thread(self):
24
+
25
+ items = []
26
+
27
+ timeout = None
28
+
29
+ while self.running:
30
+
31
+ try:
32
+
33
+ item = self.q.get(timeout=1)
34
+
35
+ if timeout == None:
36
+ timeout = time.time() + self.rotation_time
37
+
38
+ items.append(item)
39
+
40
+ except queue.Empty:
41
+ pass
42
+
43
+ if timeout:
44
+ if time.time() > timeout:
45
+
46
+ self.write_file(items)
47
+ timeout = None
48
+ items = []
49
+
50
+ def write_file(self, items):
51
+
52
+ try:
53
+
54
+ schema = pa.schema([
55
+ pa.field('embeddings', pa.list_(pa.list_(pa.float64()))),
56
+ pa.field('entity', pa.string()),
57
+ ])
58
+
59
+ fname = self.file_template.format(id=str(uuid.uuid4()))
60
+ path = f"{self.directory}/{fname}"
61
+
62
+ writer = pq.ParquetWriter(path, schema)
63
+
64
+ batch = pa.record_batch(
65
+ [
66
+ [i[0] for i in items],
67
+ [i[1] for i in items],
68
+ ],
69
+ names=['embeddings', 'entity']
70
+ )
71
+
72
+ writer.write_batch(batch)
73
+
74
+ writer.close()
75
+
76
+ print(f"Wrote {path}.")
77
+
78
+ except Exception as e:
79
+
80
+ print("Parquet write:", e)
81
+
82
+ def write(self, embeds, ent):
83
+ self.q.put((embeds, ent))
84
+
85
+ def __del__(self):
86
+
87
+ self.running = False
88
+
89
+ if hasattr(self, "q"):
90
+ self.thread.join()
91
+
92
+
93
+
94
+
@@ -0,0 +1,3 @@
1
+
2
+ from . processor import *
3
+
@@ -36,7 +36,7 @@ class Processor(ConsumerProducer):
36
36
 
37
37
  self.embeddings = EmbeddingsClient(
38
38
  pulsar_host=self.pulsar_host,
39
- subscriber=module + "emb",
39
+ subscriber=module + "-emb",
40
40
  )
41
41
 
42
42
  def emit(self, source, chunk, vectors):
@@ -87,6 +87,9 @@ class Processor(ConsumerProducer):
87
87
 
88
88
  o = defn["definition"]
89
89
 
90
+ if s == "": continue
91
+ if o == "": continue
92
+
90
93
  s_value = Value(value=str(s_uri), is_uri=True)
91
94
  o_value = Value(value=str(o), is_uri=False)
92
95