henge 0.1.1__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
henge-0.2.1/PKG-INFO ADDED
@@ -0,0 +1,25 @@
1
+ Metadata-Version: 2.1
2
+ Name: henge
3
+ Version: 0.2.1
4
+ Summary: Storage and retrieval of object-derived, decomposable recursive unique identifiers.
5
+ Home-page: https://databio.org
6
+ Author: Nathan Sheffield
7
+ Author-email: nathan@code.databio.org
8
+ License: BSD2
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: License :: OSI Approved :: BSD License
11
+ Classifier: Programming Language :: Python :: 3.7
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Topic :: System :: Distributed Computing
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE.txt
18
+
19
+ [![Build Status](https://travis-ci.com/databio/henge.svg?branch=master)](https://travis-ci.com/databio/henge)
20
+
21
+ # Henge
22
+
23
+ Henge is a Python package that builds backends for generic decomposable recursive unique identifiers (or, *DRUIDs*). It is intended to be used as a building block for sequence collections (see the [seqcol package](https://github.com/databio/seqcol)), and also for other data types that need content-derived identifiers.
24
+
25
+ Documentation at [http://henge.databio.org](http://henge.databio.org).
@@ -0,0 +1,12 @@
1
+ # Project configuration.
2
+
3
+ from ._version import __version__
4
+ from .henge import *
5
+
6
+ __classes__ = ["Henge"]
7
+ __all__ = __classes__ + [
8
+ "connect_mongo",
9
+ "split_schema",
10
+ "NotFoundException",
11
+ "canonical_str",
12
+ ]
@@ -0,0 +1 @@
1
+ __version__ = "0.2.1"
@@ -1,4 +1,4 @@
1
1
  LIBS_BY_BACKEND = {"mongo": ["pymongo", "mongodict"]}
2
2
  DELIM_ATTR = "," # chr(30); separating attributes in an item
3
3
  DELIM_ITEM = "," # separating items in a collection
4
- ITEM_TYPE = "_item_type"
4
+ ITEM_TYPE = "_item_type"
@@ -0,0 +1,224 @@
1
+
2
def retrieveOld(self, druid, reclimit=None, raw=False):
    """
    Retrieve an item given a digest (older implementation).

    The item type is read from ``self.database`` under the key
    ``druid + ITEM_TYPE`` and selects the schema from ``self.schemas``. The
    stored string is then fetched with ``self.lookup`` and split according
    to the schema's ``type``.

    :param str druid: The digest that identifies the item to retrieve.
    :param int reclimit: Recursion limit. ``None`` (default) means no limit;
        ``0`` returns the stored values without following nested digests.
    :param bool raw: Forwarded unchanged to nested ``self.retrieve`` calls
        for object attributes and primitives; not otherwise used here.
    :return: a list for ``array`` schemas, a dict for ``object`` schemas,
        otherwise the looked-up (or recursively retrieved) value.
    :raises NotFoundException: if no item type is stored for ``druid``.
    """
    try:
        item_type = self.database[druid + ITEM_TYPE]
    except Exception as err:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt/SystemExit
        # are no longer converted into NotFoundException.
        _LOGGER.debug(f"Item type not saved in database for {druid}")
        raise NotFoundException(druid) from err

    schema = self.schemas[item_type]
    _LOGGER.debug("Got druid to retrieve: {} / item_type: {} / schema: {}".format(
        druid, item_type, schema))

    # True when the caller asked us not to descend any further.
    limit_reached = isinstance(reclimit, int) and reclimit == 0

    if schema["type"] == "array":
        string = self.lookup(druid, item_type)
        _LOGGER.debug("Lookup/array/Recursive: {}; Schema: {}".format(string, schema))
        splitstr = string.split(DELIM_ITEM)
        if limit_reached:
            return splitstr
        if 'henge_class' in schema['items']:
            _LOGGER.debug("Henge classed array: {}; Schema: {}".format(string, schema))
            if isinstance(reclimit, int):
                reclimit = reclimit - 1
            # NOTE(review): ``raw`` is not forwarded here, unlike the object
            # and primitive branches below -- kept as in the original.
            return [self.retrieve(substr, reclimit) for substr in splitstr]
        return splitstr

    if schema["type"] == "object":
        string = self.lookup(druid, item_type)
        attr_array = string.split(DELIM_ATTR)
        if self.flexible_digests:
            # Flexible digests store alternating key,value,key,value...
            keys = attr_array[::2]  # evens
            vals = attr_array[1::2]  # odds
            item_reconstituted = dict(zip(keys, vals))
        else:
            # Fixed digests store values only, in schema property order.
            item_reconstituted = dict(zip(schema['properties'].keys(),
                                          attr_array))
        if 'recursive' in schema:
            if limit_reached:
                _LOGGER.debug("Lookup/obj/Recursive: {}; Schema: {}".format(string, schema))
                return item_reconstituted
            if isinstance(reclimit, int):
                reclimit = reclimit - 1
            for recursive_attr in schema['recursive']:
                if recursive_attr in item_reconstituted \
                        and item_reconstituted[recursive_attr] != "":
                    item_reconstituted[recursive_attr] = self.retrieve(
                        item_reconstituted[recursive_attr],
                        reclimit,
                        raw)
        return item_reconstituted

    # It must be a primitive type, but it could be a primitive (string) that
    # represents something to look up, or something not to look up.
    _LOGGER.debug("Lookup/prim: {}; Schema: {}".format(druid, schema))
    if 'henge_class' in schema and self.schemas[schema['henge_class']]['type'] in ['object', 'array']:
        if limit_reached:
            # Look the value up *before* logging it: the original logged
            # ``string`` first, which raised UnboundLocalError.
            string = self.lookup(druid, item_type)
            _LOGGER.debug("Lookup/prim/Recursive-skip: {}; Schema: {}".format(string, schema))
            return string
        if isinstance(reclimit, int):
            reclimit = reclimit - 1
        _LOGGER.debug("Lookup/prim/Recursive: {}; Schema: {}".format(druid, schema))
        # NOTE(review): recurses with the same ``druid``; confirm that
        # ``self.retrieve`` terminates for this case when reclimit is None.
        return self.retrieve(druid, reclimit, raw)

    string = self.lookup(druid, item_type)
    _LOGGER.debug("Lookup/prim/Non-recursive: {}; Schema: {}".format(string, schema))
    return string
81
+
82
+ # try:
83
+ # string = henge_to_query.database[druid]
84
+ # except KeyError:
85
+ # raise NotFoundException(druid)
86
+
87
+ # return reconstruct_item(string, schema, reclimit)
88
+
89
def retrieve2(self, druid, reclimit=None, raw=False):
    """
    Retrieve an item given a digest

    :param str druid: The Decomposable recursive unique identifier (DRUID), or
        digest that uniquely identifies that item to retrieve.
    :param int reclimit: Recursion limit. Set to None for no limit (default).
    :param bool raw: Return the value as a raw, henge-delimited string, instead
        of processing into a mapping. Default: False. In this function it is
        only forwarded to nested ``self.retrieve`` calls.
    :return: a list for ``array`` schemas, a dict for ``object`` schemas,
        otherwise the stored string (or the result of ``self.retrieve``).
    :raises NotFoundException: if no item type is stored for ``druid``, if
        ``self.henges`` has no entry for that item type, or if ``druid`` is
        missing from that henge's database.
    """

    def reconstruct_item(string, schema, reclimit):
        """Rebuild a value from its stored delimited ``string`` per ``schema``."""
        # ``.get`` so a schema without "type" falls through to the primitive
        # branch instead of raising KeyError on the second comparison.
        schema_type = schema.get("type")

        if schema_type == "array":
            _LOGGER.debug("Lookup/array/Recursive: {}; Schema: {}".format(string, schema))
            splitstr = string.split(DELIM_ITEM)
            items_schema = schema["items"]
            if 'henge_class' in items_schema and items_schema['type'] not in ["object", "array"]:
                _LOGGER.debug("Henge classed array: {}; Schema: {}".format(string, schema))
                # The original returned the debugging stub "ASDF" here, which
                # made the lookup below unreachable.
                # NOTE(review): ``item_type`` is the enclosing call's item
                # type, as written in the original -- confirm this is the
                # henge that holds the array's elements.
                return [reconstruct_item(self.henges[item_type].database[substr],
                                         items_schema, reclimit)
                        for substr in splitstr]
            return [reconstruct_item(substr, items_schema, reclimit)
                    for substr in splitstr]

        if schema_type == "object":
            attr_array = string.split(DELIM_ATTR)
            if self.flexible_digests:
                # Flexible digests store alternating key,value,key,value...
                keys = attr_array[::2]  # evens
                vals = attr_array[1::2]  # odds
                item_reconstituted = dict(zip(keys, vals))
            else:
                # Fixed digests store values only, in schema property order.
                item_reconstituted = dict(zip(schema['properties'].keys(),
                                              attr_array))
            if 'recursive' in schema:
                if isinstance(reclimit, int) and reclimit == 0:
                    _LOGGER.debug("Lookup/obj/Recursive: {}; Schema: {}".format(string, schema))
                    return item_reconstituted
                if isinstance(reclimit, int):
                    reclimit = reclimit - 1
                for recursive_attr in schema['recursive']:
                    # ``.get`` tolerates attributes absent from the stored
                    # item; the original indexed directly and raised KeyError.
                    # A truthy value is also necessarily != "".
                    if item_reconstituted.get(recursive_attr):
                        item_reconstituted[recursive_attr] = self.retrieve(
                            item_reconstituted[recursive_attr],
                            reclimit,
                            raw)
            return item_reconstituted

        # It must be a primitive, but it could be a primitive (string) that
        # represents something to look up, or something not to look up.
        _LOGGER.debug("Lookup/prim: {}; Schema: {}".format(string, schema))
        if 'henge_class' in schema and self.schemas[schema['henge_class']]['type'] in ['object', 'array']:
            if isinstance(reclimit, int) and reclimit == 0:
                _LOGGER.debug("Lookup/prim/Recursive-skip: {}; Schema: {}".format(string, schema))
                return string
            if isinstance(reclimit, int):
                reclimit = reclimit - 1
            _LOGGER.debug("Lookup/prim/Recursive: {}; Schema: {}".format(string, schema))
            return self.retrieve(string, reclimit, raw)
        _LOGGER.debug("Lookup/prim/Non-recursive: {}; Schema: {}".format(string, schema))
        return string

    # A membership test would require the database to define __iter__ and
    # would scan it, so existence is checked by attempting the lookup.
    try:
        item_type = self.database[druid + ITEM_TYPE]
    except Exception as err:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt/SystemExit
        # are no longer converted into NotFoundException.
        _LOGGER.debug(f"Item type not saved in database for {druid}")
        raise NotFoundException(druid) from err

    try:
        henge_to_query = self.henges[item_type]
    except Exception as err:
        _LOGGER.debug("No henges available for this item type")
        raise NotFoundException(druid) from err

    try:
        string = henge_to_query.database[druid]
    except KeyError as err:
        raise NotFoundException(druid) from err

    schema = self.schemas[item_type]
    _LOGGER.debug("Got druid to retrieve: {} / item_type: {} / schema: {}".format(
        druid, item_type, schema))
    return reconstruct_item(string, schema, reclimit)
187
+
188
+
189
+
190
+ # Part of: _insert_flat
191
+
192
def safestr(item, x):
    """
    Return ``str(item[x])``, or an empty string if the lookup fails.

    :param item: container to index; the visible caller passes a mapping.
    :param x: key (or index) to look up in ``item``.
    :return str: the stringified value, or ``""`` when ``item[x]`` cannot be
        read or converted.
    """
    try:
        return str(item[x])
    except (ValueError, TypeError, LookupError):
        # LookupError covers both KeyError (missing mapping key) and
        # IndexError (out-of-range sequence index); TypeError covers
        # non-subscriptable items such as None.
        return ""
197
+
198
+
199
+
200
def build_attr_string(item, schema, item_name=None):
    """
    Serialize ``item`` into a delimited string according to ``schema``.

    * ``array`` schemas: each element is serialized with ``schema['items']``
      and the results are joined with ``DELIM_ITEM``.
    * ``object`` schemas that declare ``properties``: with
      ``self.flexible_digests`` the item's keys that appear in the schema are
      sorted and emitted as alternating key/value fields; otherwise only the
      values are emitted, in schema property order. Fields are joined with
      ``DELIM_ATTR`` and unreadable values become ``""`` (via ``safestr``).
    * anything else is treated as a primitive and returned unchanged.

    ``self`` is not a parameter; it is resolved from the enclosing scope.

    :param item: the value to serialize (list, mapping or primitive).
    :param dict schema: schema describing ``item``.
    :param item_name: unused; kept so existing call signatures still work.
    :return: the delimited string, or ``item`` itself for primitives.
    """
    # ``.get`` so a schema without "type" is treated as a primitive rather
    # than raising KeyError on the object comparison.
    schema_type = schema.get("type")

    if schema_type == "array":
        # The flexible and fixed branches were identical in the original.
        return DELIM_ITEM.join([build_attr_string(x, schema['items'])
                                for x in item])

    if schema_type == "object" and 'properties' in schema:
        properties = schema['properties']
        if self.flexible_digests:
            # flexible schema: only keys present in both item and schema
            keys_to_include = sorted([x for x in item.keys() if x in properties])
            return DELIM_ATTR.join([DELIM_ATTR.join([k, safestr(item, k)])
                                    for k in keys_to_include])
        # fixed schema: every schema property, in schema order
        return DELIM_ATTR.join([safestr(item, x) for x in properties])

    # Assume it's a primitive. Both original branches returned ``item``; the
    # statements after the first ``return`` were unreachable and are removed.
    return item