henge 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- henge/__init__.py +8 -5
- henge/_version.py +1 -1
- henge/henge.py +32 -9
- henge/scconf.py +359 -0
- henge-0.2.3.dist-info/METADATA +132 -0
- henge-0.2.3.dist-info/RECORD +11 -0
- {henge-0.2.1.dist-info → henge-0.2.3.dist-info}/WHEEL +1 -1
- henge-0.2.1.dist-info/METADATA +0 -28
- henge-0.2.1.dist-info/RECORD +0 -11
- henge-0.2.1.dist-info/entry_points.txt +0 -2
- {henge-0.2.1.dist-info → henge-0.2.3.dist-info/licenses}/LICENSE.txt +0 -0
- {henge-0.2.1.dist-info → henge-0.2.3.dist-info}/top_level.txt +0 -0
henge/__init__.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
|
-
# Project configuration
|
|
1
|
+
# Project configuration.
|
|
2
2
|
|
|
3
|
-
import logmuse
|
|
4
3
|
from ._version import __version__
|
|
5
4
|
from .henge import *
|
|
6
5
|
|
|
7
6
|
__classes__ = ["Henge"]
|
|
8
|
-
__all__ = __classes__ + [
|
|
9
|
-
|
|
10
|
-
|
|
7
|
+
__all__ = __classes__ + [
|
|
8
|
+
"connect_mongo",
|
|
9
|
+
"split_schema",
|
|
10
|
+
"NotFoundException",
|
|
11
|
+
"canonical_str",
|
|
12
|
+
"sha512t24u_digest",
|
|
13
|
+
]
|
henge/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.2.
|
|
1
|
+
__version__ = "0.2.3"
|
henge/henge.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""An interface to a database back-end for DRUIDs"""
|
|
2
2
|
|
|
3
|
+
import base64
|
|
3
4
|
import copy
|
|
4
5
|
import hashlib
|
|
5
6
|
import jsonschema
|
|
6
|
-
import logging
|
|
7
7
|
import json
|
|
8
|
+
import logging
|
|
8
9
|
import os
|
|
9
10
|
import sys
|
|
10
11
|
import yacman
|
|
@@ -28,6 +29,13 @@ class NotFoundException(Exception):
|
|
|
28
29
|
return self.message
|
|
29
30
|
|
|
30
31
|
|
|
32
|
+
def sha512t24u_digest(seq: str, offset: int = 24) -> str:
    """GA4GH digest: truncated URL-safe base64 of the SHA-512 of *seq*.

    :param str seq: input string to digest
    :param int offset: number of leading digest bytes to keep (default 24)
    :return str: URL-safe base64 encoding of the truncated digest
    """
    truncated = hashlib.sha512(seq.encode()).digest()[:offset]
    return base64.urlsafe_b64encode(truncated).decode("ascii")
|
|
37
|
+
|
|
38
|
+
|
|
31
39
|
def md5(seq):
    """Return the hex MD5 digest of a string (UTF-8 encoded)."""
    hasher = hashlib.md5()
    hasher.update(seq.encode())
    return hasher.hexdigest()
|
|
33
41
|
|
|
@@ -49,7 +57,6 @@ def read_url(url):
|
|
|
49
57
|
raise e
|
|
50
58
|
data = response.read() # a `bytes` object
|
|
51
59
|
text = data.decode("utf-8")
|
|
52
|
-
print(text)
|
|
53
60
|
return yaml.safe_load(text)
|
|
54
61
|
|
|
55
62
|
|
|
@@ -142,11 +149,11 @@ class Henge(object):
|
|
|
142
149
|
:param bool raw: Return the value as a raw, henge-delimited string, instead
|
|
143
150
|
of processing into a mapping. Default: False.
|
|
144
151
|
"""
|
|
145
|
-
try:
|
|
152
|
+
try:
|
|
146
153
|
item_type = self.database[druid + ITEM_TYPE]
|
|
147
154
|
except KeyError:
|
|
148
155
|
raise NotFoundException(druid)
|
|
149
|
-
|
|
156
|
+
|
|
150
157
|
digested_string = self.lookup(druid, item_type)
|
|
151
158
|
reconstructed_item = json.loads(digested_string)
|
|
152
159
|
|
|
@@ -243,8 +250,9 @@ class Henge(object):
|
|
|
243
250
|
|
|
244
251
|
if item_type not in self.schemas.keys():
|
|
245
252
|
_LOGGER.error(
|
|
246
|
-
"I don't know about items of type '{}'. "
|
|
247
|
-
|
|
253
|
+
"I don't know about items of type '{}'. I know of: '{}'".format(
|
|
254
|
+
item_type, list(self.schemas.keys())
|
|
255
|
+
)
|
|
248
256
|
)
|
|
249
257
|
return False
|
|
250
258
|
|
|
@@ -328,8 +336,9 @@ class Henge(object):
|
|
|
328
336
|
"""
|
|
329
337
|
if item_type not in self.schemas.keys():
|
|
330
338
|
_LOGGER.error(
|
|
331
|
-
"I don't know about items of type '{}'. "
|
|
332
|
-
|
|
339
|
+
"I don't know about items of type '{}'. I know of: '{}'".format(
|
|
340
|
+
item_type, list(self.schemas.keys())
|
|
341
|
+
)
|
|
333
342
|
)
|
|
334
343
|
return False
|
|
335
344
|
|
|
@@ -442,6 +451,20 @@ class Henge(object):
|
|
|
442
451
|
for k, v in self.database.items():
|
|
443
452
|
print(k, v)
|
|
444
453
|
|
|
454
|
+
def __len__(self):
    """Return the number of records in the backing key-value database."""
    database = self.database
    return len(database)
|
|
456
|
+
|
|
457
|
+
def list(self, limit=1000, offset=0):
    """
    List all items in the database.

    Returns a paging envelope with the total record count, the paging
    parameters used, and one page of database keys.

    :param int limit: maximum number of keys to return (default 1000)
    :param int offset: number of keys to skip from the start (default 0)
    :return dict: count/limit/offset plus the requested slice of keys
    """
    keys = [*self.database]  # database keys, in iteration order
    return {
        "count": len(self.database),
        "limit": limit,
        "offset": offset,
        "items": keys[offset : (offset + limit)],
    }
|
|
467
|
+
|
|
445
468
|
def __repr__(self):
    """Short description listing the item types this henge serves."""
    type_csv = ",".join(self.item_types)
    return "Henge object. Item types: " + type_csv
|
henge/scconf.py
ADDED
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import psycopg2
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from psycopg2 import OperationalError, sql
|
|
7
|
+
from psycopg2.errors import UniqueViolation
|
|
8
|
+
|
|
9
|
+
_LOGGER = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
# Use like:
|
|
12
|
+
# pgdb = RDBDict(...) # Open connection
|
|
13
|
+
# pgdb["key"] = "value" # Insert item
|
|
14
|
+
# pgdb["key"] # Retrieve item
|
|
15
|
+
# pgdb.close() # Close connection
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# This was originally written in seqcolapi.
|
|
19
|
+
# I am moving it here in 2025, because the whole point was to enable
|
|
20
|
+
# interesting database back-ends to have dict-style key-value pair
|
|
21
|
+
# mechanisms, which was enabling henge to use these various backends
|
|
22
|
+
# to back arbitrary databases.
|
|
23
|
+
# with the move to sqlmodel, I abandoned the henge backend approach,
|
|
24
|
+
# so intermediates are no longer important for seqcol.
|
|
25
|
+
|
|
26
|
+
# they could become relevant for other henge use cases, so they
|
|
27
|
+
# fit better here now.
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def getenv(varname):
    """Simple wrapper to make the Exception more informative for missing env var.

    :param str varname: name of the environment variable to read
    :return str: the variable's value
    :raise Exception: if the variable is not set (original KeyError chained)
    """
    try:
        return os.environ[varname]
    except KeyError as e:
        # Chain the KeyError so tracebacks show the root cause.
        raise Exception(f"Environment variable {varname} not set.") from e
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
import pipestat
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class PipestatMapping(pipestat.PipestatManager):
    """A wrapper class to allow using a PipestatManager as a dict-like object."""

    def __getitem__(self, key):
        # This little hack makes this work with `in`;
        # e.g.: for x in rdbdict, which is now disabled, instead of infinite.
        if isinstance(key, int):
            raise IndexError
        return self.retrieve(key)

    def __setitem__(self, key, value):
        return self.insert({key: value})

    def __len__(self):
        return self.count_records()

    def _next_page(self):
        """Advance the buffered iterator to the next page of records.

        Loads the next page into the buffer and returns its first record.
        """
        self._buf["page_index"] += 1
        limit = self._buf["page_size"]
        offset = self._buf["page_index"] * limit
        self._buf["keys"] = self.get_records(limit, offset)
        return self._buf["keys"][0]

    def __iter__(self):
        _LOGGER.debug("Iterating...")
        # BUGFIX: initialize the buffer *before* loading the first page.
        # Previously the dict literal called self._next_page(), which reads
        # self._buf before it was assigned, raising AttributeError (or using
        # stale state from a prior iteration).
        self._buf = {  # buffered iterator
            "current_view_index": 0,
            "len": len(self),
            "page_size": 100,
            "page_index": -1,
            "keys": [],
        }
        self._next_page()  # fills self._buf["keys"] with the first page
        return self

    def __next__(self):
        # BUGFIX: >= (not >) so exactly `len` items are yielded; the old
        # check attempted to read one item past the end.
        if self._buf["current_view_index"] >= self._buf["len"]:
            raise StopIteration

        idx = (
            self._buf["current_view_index"]
            - self._buf["page_index"] * self._buf["page_size"]
        )
        self._buf["current_view_index"] += 1
        if idx < self._buf["page_size"]:
            # Zero-based indexing into the current page; the old keys[idx - 1]
            # returned the page's last element first and duplicated the
            # boundary element when crossing pages.
            return self._buf["keys"][idx]
        else:  # current index is beyond current page, but not beyond total
            return self._next_page()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class RDBDict(Mapping):
    """
    A Relational DataBase Dict.

    Simple database connection manager object that allows us to use a
    PostgresQL database as a simple key-value store to back Python
    dict-style access to database items.
    """

    def __init__(
        self,
        db_name: str = None,
        db_user: str = None,
        db_password: str = None,
        db_host: str = None,
        db_port: str = None,
        db_table: str = None,
    ):
        """
        Connect to a PostgreSQL table used as a key-value store.

        Arguments left as None fall back to POSTGRES_* environment
        variables (name/user/password are required; host, port, and
        table have defaults).

        :raise Exception: if a required env var is missing or the
            connection cannot be established
        """
        self.connection = None
        self.db_name = db_name or getenv("POSTGRES_DB")
        self.db_user = db_user or getenv("POSTGRES_USER")
        self.db_host = db_host or os.environ.get("POSTGRES_HOST") or "localhost"
        self.db_port = db_port or os.environ.get("POSTGRES_PORT") or "5432"
        self.db_table = db_table or os.environ.get("POSTGRES_TABLE") or "seqcol"
        # The password is deliberately not stored on the object.
        db_password = db_password or getenv("POSTGRES_PASSWORD")

        try:
            self.connection = self.create_connection(
                self.db_name, self.db_user, db_password, self.db_host, self.db_port
            )
            if not self.connection:
                raise Exception("Connection failed")
        except Exception as e:
            _LOGGER.info(f"{self}")
            raise e
        _LOGGER.info(self.connection)
        # Autocommit so each insert/update/delete is persisted immediately.
        self.connection.autocommit = True

    def __repr__(self):
        return (
            "RDBD object\n"
            + "db_table: {}\n".format(self.db_table)
            + "db_name: {}\n".format(self.db_name)
            + "db_user: {}\n".format(self.db_user)
            + "db_host: {}\n".format(self.db_host)
            + "db_port: {}\n".format(self.db_port)
        )

    def init_table(self):
        """Create the key/value table if it does not already exist."""
        # Wrap statements to prevent SQL injection attacks
        stmt = sql.SQL(
            """
            CREATE TABLE IF NOT EXISTS {table}(
            key TEXT PRIMARY KEY,
            value TEXT);
        """
        ).format(table=sql.Identifier(self.db_table))
        return self.execute_query(stmt, params=None)

    def insert(self, key, value):
        """Insert a new key/value pair; raises UniqueViolation for dup keys."""
        stmt = sql.SQL(
            """
            INSERT INTO {table}(key, value)
            VALUES (%(key)s, %(value)s);
        """
        ).format(table=sql.Identifier(self.db_table))
        params = {"key": key, "value": value}
        return self.execute_query(stmt, params)

    def update(self, key, value):
        """Overwrite the value stored under an existing key."""
        stmt = sql.SQL(
            """
            UPDATE {table} SET value=%(value)s WHERE key=%(key)s
        """
        ).format(table=sql.Identifier(self.db_table))
        params = {"key": key, "value": value}
        return self.execute_query(stmt, params)

    def __getitem__(self, key):
        # This little hack makes this work with `in`;
        # e.g.: for x in rdbdict, which is now disabled, instead of infinite.
        if isinstance(key, int):
            raise IndexError
        stmt = sql.SQL(
            """
            SELECT value FROM {table} WHERE key=%(key)s
        """
        ).format(table=sql.Identifier(self.db_table))
        params = {"key": key}
        res = self.execute_read_query(stmt, params)
        if not res:
            # NOTE(review): returns None for a missing key instead of raising
            # KeyError as the Mapping contract expects; kept as-is because
            # existing callers may rely on the None return.
            _LOGGER.info("Not found: {}".format(key))
        return res

    def __setitem__(self, key, value):
        # Try the insert first; fall back to an update for existing keys.
        try:
            return self.insert(key, value)
        except UniqueViolation as e:
            _LOGGER.info("Updating existing value for {}".format(key))
            return self.update(key, value)

    def __delitem__(self, key):
        stmt = sql.SQL(
            """
            DELETE FROM {table} WHERE key=%(key)s
        """
        ).format(table=sql.Identifier(self.db_table))
        params = {"key": key}
        res = self.execute_query(stmt, params)
        return res

    def create_connection(self, db_name, db_user, db_password, db_host, db_port):
        """Open a psycopg2 connection; returns None on OperationalError."""
        connection = None
        try:
            connection = psycopg2.connect(
                database=db_name,
                user=db_user,
                password=db_password,
                host=db_host,
                port=db_port,
            )
            _LOGGER.info("Connection to PostgreSQL DB successful")
        except OperationalError as e:
            _LOGGER.info("Error: {e}".format(e=str(e)))
        return connection

    def execute_read_query(self, query, params=None):
        """Run a query and return the first column of the first row, or None."""
        # BUGFIX: use the cursor as a context manager so it is always closed;
        # previously cursors were never closed (resource leak).
        with self.connection.cursor() as cursor:
            try:
                cursor.execute(query, params)
                result = cursor.fetchone()
                if result:
                    return result[0]
                _LOGGER.debug(f"Query: {query}")
                _LOGGER.debug(f"Result: {result}")
                return None
            except OperationalError as e:
                _LOGGER.info("Error: {e}".format(e=str(e)))
                raise
            except TypeError as e:
                _LOGGER.info("TypeError: {e}, item: {q}".format(e=str(e), q=query))
                raise

    def execute_multi_query(self, query, params=None):
        """Run a query and return all result rows (list of tuples)."""
        with self.connection.cursor() as cursor:
            try:
                cursor.execute(query, params)
                return cursor.fetchall()
            except OperationalError as e:
                _LOGGER.info("Error: {e}".format(e=str(e)))
                raise
            except TypeError as e:
                _LOGGER.info("TypeError: {e}, item: {q}".format(e=str(e), q=query))
                raise

    def execute_query(self, query, params=None):
        """Run a write query; OperationalErrors are logged and swallowed."""
        # BUGFIX: removed unreachable log statement that followed the return,
        # and close the cursor via context manager.
        with self.connection.cursor() as cursor:
            try:
                return cursor.execute(query, params)
            except OperationalError as e:
                _LOGGER.info("Error: {e}".format(e=str(e)))

    def close(self):
        _LOGGER.info("Closing connection")
        return self.connection.close()

    def __del__(self):
        # getattr guard: __init__ may have raised before self.connection
        # was assigned, and __del__ must never raise.
        if getattr(self, "connection", None):
            self.close()

    def __len__(self):
        stmt = sql.SQL(
            """
            SELECT COUNT(*) FROM {table}
        """
        ).format(table=sql.Identifier(self.db_table))
        _LOGGER.debug(stmt)
        res = self.execute_read_query(stmt)
        return res

    def get_paged_keys(self, limit=None, offset=None):
        """Return one page of keys as rows (each row is a 1-tuple)."""
        stmt = sql.SQL("SELECT key FROM {table}").format(
            table=sql.Identifier(self.db_table)
        )
        params = {}
        if limit is not None:
            stmt = sql.SQL("{} LIMIT %(limit)s").format(stmt)
            params["limit"] = limit
        if offset is not None:
            stmt = sql.SQL("{} OFFSET %(offset)s").format(stmt)
            params["offset"] = offset
        res = self.execute_multi_query(stmt, params if params else None)
        return res

    def _next_page(self):
        """Load the next page of keys into the buffer; return its first row."""
        self._buf["page_index"] += 1
        limit = self._buf["page_size"]
        offset = self._buf["page_index"] * limit
        self._buf["keys"] = self.get_paged_keys(limit, offset)
        return self._buf["keys"][0]

    def __iter__(self):
        _LOGGER.debug("Iterating...")
        self._buf = {  # buffered iterator
            "current_view_index": 0,
            "len": len(self),
            "page_size": 10,
            "page_index": 0,
            "keys": self.get_paged_keys(10, 0),
        }
        return self

    def __next__(self):
        # BUGFIX: >= (not >) so exactly `len` items are yielded; the old
        # check attempted to read one item past the end.
        if self._buf["current_view_index"] >= self._buf["len"]:
            raise StopIteration

        idx = (
            self._buf["current_view_index"]
            - self._buf["page_index"] * self._buf["page_size"]
        )
        self._buf["current_view_index"] += 1
        if idx < self._buf["page_size"]:
            # Zero-based indexing into the current page; the old keys[idx - 1]
            # returned the page's last element first and duplicated the
            # boundary element when crossing pages.
            return self._buf["keys"][idx]
        else:  # current index is beyond current page, but not beyond total
            return self._next_page()
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
# We don't need the full SeqColHenge,
|
|
341
|
+
# which also has loading capability, and requires pyfaidx, which requires
|
|
342
|
+
# biopython, which requires numpy, which is huge and can't compile in the
|
|
343
|
+
# default fastapi container.
|
|
344
|
+
# So, I had written the below class which provides retrieve only.
|
|
345
|
+
# HOWEVER, switching from alpine to slim allows install of numpy;
|
|
346
|
+
# This inflates the container size from 262Mb to 350Mb; perhaps that's worth paying.
|
|
347
|
+
# So I can avoid duplicating this and just use the full SeqColHenge from seqcol
|
|
348
|
+
# class SeqColHenge(refget.RefGetClient):
|
|
349
|
+
# def retrieve(self, druid, reclimit=None, raw=False):
|
|
350
|
+
# try:
|
|
351
|
+
# return super(SeqColHenge, self).retrieve(druid, reclimit, raw)
|
|
352
|
+
# except henge.NotFoundException as e:
|
|
353
|
+
# _LOGGER.debug(e)
|
|
354
|
+
# try:
|
|
355
|
+
# return self.refget(druid)
|
|
356
|
+
# except Exception as e:
|
|
357
|
+
# _LOGGER.debug(e)
|
|
358
|
+
# raise e
|
|
359
|
+
# return henge.NotFoundException("{} not found in database, or in refget.".format(druid))
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: henge
|
|
3
|
+
Version: 0.2.3
|
|
4
|
+
Summary: Storage and retrieval of object-derived, decomposable recursive unique identifiers.
|
|
5
|
+
Home-page: https://databio.org
|
|
6
|
+
Author: Nathan Sheffield
|
|
7
|
+
Author-email: nathan@code.databio.org
|
|
8
|
+
License: BSD2
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE.txt
|
|
19
|
+
Requires-Dist: jsonschema
|
|
20
|
+
Requires-Dist: ubiquerg>=0.5.2
|
|
21
|
+
Requires-Dist: yacman>=0.6.7
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: author-email
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: keywords
|
|
29
|
+
Dynamic: license
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
Dynamic: requires-dist
|
|
32
|
+
Dynamic: requires-python
|
|
33
|
+
Dynamic: summary
|
|
34
|
+
|
|
35
|
+
# Henge
|
|
36
|
+
|
|
37
|
+
Henge is a Python package for building data storage and retrieval interfaces for arbitrary data. Henge is based on the idea of **decomposable recursive unique identifiers (DRUIDs)**, which are hash-based unique identifiers for data derived from the data itself. For arbitrary data with any structure, Henge can mint unique DRUIDs to identify data, store the data in a key-value database of your choice, and provide lookup functions to retrieve the data in its original structure using its DRUID identifier.
|
|
38
|
+
|
|
39
|
+
Henge was intended as a building block for [sequence collections](https://github.com/refgenie/seqcol), but is generic enough to use for any data type that needs content-derived identifiers with database lookup capability.
|
|
40
|
+
|
|
41
|
+
## Install
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
pip install henge
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
Create a Henge object by providing a database and a data schema. The database can be a Python dict or backed by persistent storage. Data schemas are [JSON-schema](https://json-schema.org/) descriptions of data types, and can be hierarchical.
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
import henge
|
|
53
|
+
|
|
54
|
+
schemas = ["path/to/json_schema.yaml"]
|
|
55
|
+
h = henge.Henge(database={}, schemas=schemas)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Insert items into the henge. Upon insert, henge returns the DRUID (digest/checksum/unique identifier) for your object:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
druid = h.insert({"name": "Pat", "age": 38}, item_type="person")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Retrieve the original object using the DRUID:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
h.retrieve(druid)
|
|
68
|
+
# {'age': '38', 'name': 'Pat'}
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Tutorial
|
|
72
|
+
|
|
73
|
+
For a comprehensive walkthrough covering basic types, arrays, nested objects, and advanced features, see the [tutorial notebook](docs/tutorial.ipynb).
|
|
74
|
+
|
|
75
|
+
## What are DRUIDs?
|
|
76
|
+
|
|
77
|
+
DRUIDs are a special type of unique identifiers with two powerful properties:
|
|
78
|
+
|
|
79
|
+
- **Decomposable**: Identifiers in henge automatically retrieve structured data (tuples, arrays, objects). The structure is defined by a JSON schema, so henge can be used as a back-end for arbitrary data types.
|
|
80
|
+
|
|
81
|
+
- **Recursive**: Individual elements retrieved by henge can be tagged as recursive, meaning these attributes contain their own DRUIDs. Henge can recurse through these, allowing you to mint unique identifiers for arbitrary nested data structures.
|
|
82
|
+
|
|
83
|
+
A DRUID is ultimately the result of a digest operation (such as `md5` or `sha256`) on some data. Because DRUIDs are computed deterministically from the item, they represent globally unique identifiers. If you insert the same item repeatedly, it will produce the same DRUID -- this is true across henges as long as they share a data schema.
|
|
84
|
+
|
|
85
|
+
## Persisting Data
|
|
86
|
+
|
|
87
|
+
### In-memory (default)
|
|
88
|
+
|
|
89
|
+
Use a Python `dict` as the database for testing or ephemeral use:
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
h = henge.Henge(database={}, schemas=schemas)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### SQLite backend
|
|
96
|
+
|
|
97
|
+
For persistent storage with SQLite:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from sqlitedict import SqliteDict
|
|
101
|
+
|
|
102
|
+
mydict = SqliteDict('./my_db.sqlite', autocommit=True)
|
|
103
|
+
h = henge.Henge(mydict, schemas=schemas)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Requires: `pip install sqlitedict`
|
|
107
|
+
|
|
108
|
+
### MongoDB backend
|
|
109
|
+
|
|
110
|
+
For production use with MongoDB:
|
|
111
|
+
|
|
112
|
+
1. **Start MongoDB with Docker:**
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
docker run --network="host" mongo
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
For persistent storage, mount a volume to `/data/db`:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
docker run -it --network="host" -v /path/to/data:/data/db mongo
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
2. **Connect henge to MongoDB:**
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import henge
|
|
128
|
+
|
|
129
|
+
h = henge.Henge(henge.connect_mongo(), schemas=schemas)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Requires: `pip install pymongo mongodict`
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
henge/__init__.py,sha256=PBJsthV7TCS3Wu-4fLxBIaBFf_GwcnN3HyWELHuWdAI,246
|
|
2
|
+
henge/_version.py,sha256=PNiDER4qM19h9zdsdfgKt2_dT4WgYK7EguJ8RU2qA_g,22
|
|
3
|
+
henge/const.py,sha256=0t3EgqdjmKBd-zu5L8AJnGoWv0T3sAtvXf-3b62Dd-Y,194
|
|
4
|
+
henge/deprecated.py,sha256=C8eINR2gWCiNaN2b3gbFYn8jfJ0ftJm8a_fIgVVVzXc,11248
|
|
5
|
+
henge/henge.py,sha256=UHQCXNcGs0zsOXarll3ZCzddb6pWqGWml5ZqBXSM6WU,24088
|
|
6
|
+
henge/scconf.py,sha256=b201HzE_l3hO0JgusQnI2XhdLayWgXEoqeaZzJwSke4,12132
|
|
7
|
+
henge-0.2.3.dist-info/licenses/LICENSE.txt,sha256=oB6ZGDa4kcznznJKJsLLFFcOZyi8Y6e2Jv0rJozgp-I,1269
|
|
8
|
+
henge-0.2.3.dist-info/METADATA,sha256=Qhf7hrZyhXJRpJVnDdi2bn717BNsScFKk1UncE8JGHo,4402
|
|
9
|
+
henge-0.2.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
+
henge-0.2.3.dist-info/top_level.txt,sha256=QyovlLuKhhKP1r8bMVmxLdke9F6PZFIN7VlkzvB0xIQ,6
|
|
11
|
+
henge-0.2.3.dist-info/RECORD,,
|
henge-0.2.1.dist-info/METADATA
DELETED
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: henge
|
|
3
|
-
Version: 0.2.1
|
|
4
|
-
Summary: Storage and retrieval of object-derived, decomposable recursive unique identifiers.
|
|
5
|
-
Home-page: https://databio.org
|
|
6
|
-
Author: Nathan Sheffield
|
|
7
|
-
Author-email: nathan@code.databio.org
|
|
8
|
-
License: BSD2
|
|
9
|
-
Classifier: Development Status :: 4 - Beta
|
|
10
|
-
Classifier: License :: OSI Approved :: BSD License
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Topic :: System :: Distributed Computing
|
|
16
|
-
Description-Content-Type: text/markdown
|
|
17
|
-
License-File: LICENSE.txt
|
|
18
|
-
Requires-Dist: jsonschema
|
|
19
|
-
Requires-Dist: ubiquerg (>=0.5.2)
|
|
20
|
-
Requires-Dist: yacman (>=0.6.7)
|
|
21
|
-
|
|
22
|
-
[](https://travis-ci.com/databio/henge)
|
|
23
|
-
|
|
24
|
-
# Henge
|
|
25
|
-
|
|
26
|
-
Henge is a Python package that builds backends for generic decomposable recursive unique identifiers (or, *DRUIDs*). It is intended to be used as a building block for sequence collections (see the [seqcol package](https://github.com/databio/seqcol)), and also for other data types that need content-derived identifiers.
|
|
27
|
-
|
|
28
|
-
Documentation at [http://henge.databio.org](http://henge.databio.org).
|
henge-0.2.1.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
henge/__init__.py,sha256=Su4ZQre-GS24y-LjiAIf57GYovLTww0hkk2fDxzMt_g,289
|
|
2
|
-
henge/_version.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
|
|
3
|
-
henge/const.py,sha256=0t3EgqdjmKBd-zu5L8AJnGoWv0T3sAtvXf-3b62Dd-Y,194
|
|
4
|
-
henge/deprecated.py,sha256=C8eINR2gWCiNaN2b3gbFYn8jfJ0ftJm8a_fIgVVVzXc,11248
|
|
5
|
-
henge/henge.py,sha256=mvAcXefMy5r8XcgRe9XXPH5uuXdyL9-sXRYWLNp_fos,23454
|
|
6
|
-
henge-0.2.1.dist-info/LICENSE.txt,sha256=oB6ZGDa4kcznznJKJsLLFFcOZyi8Y6e2Jv0rJozgp-I,1269
|
|
7
|
-
henge-0.2.1.dist-info/METADATA,sha256=zW6QW4rlOKOpkymjUGkvlMRhbdXZQU_7YvTx3WM2ap8,1270
|
|
8
|
-
henge-0.2.1.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
|
9
|
-
henge-0.2.1.dist-info/entry_points.txt,sha256=c2OKgrH1a5Cx2osbUFSe9NFK8CbN82lPPsi4wry77_M,61
|
|
10
|
-
henge-0.2.1.dist-info/top_level.txt,sha256=QyovlLuKhhKP1r8bMVmxLdke9F6PZFIN7VlkzvB0xIQ,6
|
|
11
|
-
henge-0.2.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|