datafun 0.5.2__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datafun-0.5.2 → datafun-0.6.1}/PKG-INFO +4 -3
- {datafun-0.5.2 → datafun-0.6.1}/README.md +1 -1
- {datafun-0.5.2 → datafun-0.6.1}/datafun/sources/elk.py +41 -25
- {datafun-0.5.2 → datafun-0.6.1}/datafun.egg-info/PKG-INFO +4 -3
- {datafun-0.5.2 → datafun-0.6.1}/pyproject.toml +1 -1
- {datafun-0.5.2 → datafun-0.6.1}/LICENSE +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun/__init__.py +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun/cache.py +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun/dataset.py +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun/sources/__init__.py +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun/sources/gcs.py +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun/sources/iterable.py +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun/sources/local_file.py +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun/sources/rest.py +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun/utils.py +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun.egg-info/SOURCES.txt +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun.egg-info/dependency_links.txt +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun.egg-info/requires.txt +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/datafun.egg-info/top_level.txt +0 -0
- {datafun-0.5.2 → datafun-0.6.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datafun
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: datafun brings the fun back to data pipelines
|
|
5
5
|
Author-email: "Diego Giorgini, Luigi Di Sotto, Saeed Choobani" <diego.giorgini@aitechnologies.it>
|
|
6
6
|
Requires-Python: >=3.8
|
|
@@ -12,6 +12,7 @@ Requires-Dist: tqdm
|
|
|
12
12
|
Requires-Dist: google-cloud-storage
|
|
13
13
|
Requires-Dist: elasticsearch<8
|
|
14
14
|
Requires-Dist: requests
|
|
15
|
+
Dynamic: license-file
|
|
15
16
|
|
|
16
17
|
# 🍻 datafun [](https://pepy.tech/project/datafun)
|
|
17
18
|
|
|
@@ -304,7 +305,7 @@ You can see examples for every operation in the [dedicated notebook](./examples/
|
|
|
304
305
|
| **start_isodate** | str (ISO datetime) | Yes | | Elastic start date range with format: "2021-09-15T10:00:00.000Z" |
|
|
305
306
|
| **end_isodate** | str (ISO datetime) | Yes | | Elastic end date range with format: "2021-09-15T10:00:00.000Z" |
|
|
306
307
|
| **date_field** | str | No | @timestamp | Elastic date field. Can be nested into list, eg. "messages.date" |
|
|
307
|
-
| **date_field_separator** | str | No | . | Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
|
|
308
|
+
| **date_field_separator** | str | No | . | [DEPRECATED] (separator automatically inferred) Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
|
|
308
309
|
|
|
309
310
|
**Returned element type**: ```dict```. Each element is a document matching the given query.
|
|
310
311
|
|
|
@@ -289,7 +289,7 @@ You can see examples for every operation in the [dedicated notebook](./examples/
|
|
|
289
289
|
| **start_isodate** | str (ISO datetime) | Yes | | Elastic start date range with format: "2021-09-15T10:00:00.000Z" |
|
|
290
290
|
| **end_isodate** | str (ISO datetime) | Yes | | Elastic end date range with format: "2021-09-15T10:00:00.000Z" |
|
|
291
291
|
| **date_field** | str | No | @timestamp | Elastic date field. Can be nested into list, eg. "messages.date" |
|
|
292
|
-
| **date_field_separator** | str | No | . | Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
|
|
292
|
+
| **date_field_separator** | str | No | . | [DEPRECATED] (separator automatically inferred) Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
|
|
293
293
|
|
|
294
294
|
**Returned element type**: ```dict```. Each element is a document matching the given query.
|
|
295
295
|
|
|
@@ -31,6 +31,8 @@ class ELKDatasetConfig:
|
|
|
31
31
|
|
|
32
32
|
class ELKDataset(DatasetSource):
|
|
33
33
|
def __init__(self, config: ELKDatasetConfig, **kwargs):
|
|
34
|
+
if 'date_field_separator' in kwargs:
|
|
35
|
+
print("WARN: date_field_separator is deprecated, the separator is now automatically inferred")
|
|
34
36
|
super().__init__(config=config, **kwargs)
|
|
35
37
|
|
|
36
38
|
self.es = Elasticsearch(
|
|
@@ -53,28 +55,36 @@ class ELKDataset(DatasetSource):
|
|
|
53
55
|
|
|
54
56
|
def _generate_examples(self) -> Generator[dict, None, None]:
|
|
55
57
|
for Q in self.Qs:
|
|
56
|
-
res =
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
58
|
+
res = None
|
|
59
|
+
old_scroll_id = None
|
|
60
|
+
try:
|
|
61
|
+
res = self._search(index=self.config.index, body=json.dumps(Q), scroll='20s')
|
|
62
|
+
hits = dl.get(res, 'hits.hits', default=[])
|
|
63
|
+
|
|
64
|
+
old_scroll_id = dl.get(res, '_scroll_id')
|
|
65
|
+
while len(hits) > 0:
|
|
66
|
+
# iterate over the document hits for each 'scroll'
|
|
67
|
+
for doc in hits: # type: ignore
|
|
68
|
+
yield doc
|
|
69
|
+
|
|
70
|
+
# make a request using the Scroll API
|
|
71
|
+
try:
|
|
72
|
+
res = self._scroll(scroll_id=old_scroll_id, scroll='20s')
|
|
73
|
+
hits = dl.get(res, 'hits.hits')
|
|
74
|
+
# keep track of past scroll_id
|
|
75
|
+
old_scroll_id = dl.get(res, '_scroll_id')
|
|
76
|
+
except exceptions.NotFoundError:
|
|
77
|
+
hits = []
|
|
78
|
+
except exceptions.RequestError:
|
|
79
|
+
# _scroll_id is None, because the response is not "paginated"
|
|
80
|
+
hits = []
|
|
81
|
+
finally:
|
|
82
|
+
if old_scroll_id is not None:
|
|
83
|
+
try:
|
|
84
|
+
self.es.clear_scroll(scroll_id=old_scroll_id)
|
|
85
|
+
except Exception:
|
|
86
|
+
# Clear failures should not hide the original exception
|
|
87
|
+
pass
|
|
78
88
|
|
|
79
89
|
def set_time_interval(self, query: dict) -> dict:
|
|
80
90
|
if not isinstance(query, dict):
|
|
@@ -94,11 +104,17 @@ class ELKDataset(DatasetSource):
|
|
|
94
104
|
if not isinstance(xs, List):
|
|
95
105
|
raise TypeError(f'Field query.bool.filter must be of type List, but found of type {type(xs)}')
|
|
96
106
|
|
|
97
|
-
|
|
107
|
+
path_sep_alts = ['/', '//--@@--//']
|
|
108
|
+
path_sep = '.'
|
|
109
|
+
while path_sep in self.config.date_field:
|
|
110
|
+
try:
|
|
111
|
+
path_sep = path_sep_alts.pop(0)
|
|
112
|
+
except Exception as e:
|
|
113
|
+
raise ValueError(f'Field {self.config.date_field} contains invalid characters. Exception: {e}')
|
|
98
114
|
for idx, obj in enumerate(xs):
|
|
99
115
|
if dl.has(obj, "range"):
|
|
100
|
-
obj = dl.update(obj, f"range{
|
|
101
|
-
obj = dl.update(obj, f"range{
|
|
116
|
+
obj = dl.update(obj, f"range{path_sep}{self.config.date_field}{path_sep}gte", value=self.config.start_isodate, sep=path_sep)
|
|
117
|
+
obj = dl.update(obj, f"range{path_sep}{self.config.date_field}{path_sep}lte", value=self.config.end_isodate, sep=path_sep)
|
|
102
118
|
if not obj:
|
|
103
119
|
raise ValueError(f'{self.config.date_field}.lte or {self.config.date_field}.lte fields can\'t be updated, e.g. check '
|
|
104
120
|
'if they exist in the query.')
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datafun
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: datafun brings the fun back to data pipelines
|
|
5
5
|
Author-email: "Diego Giorgini, Luigi Di Sotto, Saeed Choobani" <diego.giorgini@aitechnologies.it>
|
|
6
6
|
Requires-Python: >=3.8
|
|
@@ -12,6 +12,7 @@ Requires-Dist: tqdm
|
|
|
12
12
|
Requires-Dist: google-cloud-storage
|
|
13
13
|
Requires-Dist: elasticsearch<8
|
|
14
14
|
Requires-Dist: requests
|
|
15
|
+
Dynamic: license-file
|
|
15
16
|
|
|
16
17
|
# 🍻 datafun [](https://pepy.tech/project/datafun)
|
|
17
18
|
|
|
@@ -304,7 +305,7 @@ You can see examples for every operation in the [dedicated notebook](./examples/
|
|
|
304
305
|
| **start_isodate** | str (ISO datetime) | Yes | | Elastic start date range with format: "2021-09-15T10:00:00.000Z" |
|
|
305
306
|
| **end_isodate** | str (ISO datetime) | Yes | | Elastic end date range with format: "2021-09-15T10:00:00.000Z" |
|
|
306
307
|
| **date_field** | str | No | @timestamp | Elastic date field. Can be nested into list, eg. "messages.date" |
|
|
307
|
-
| **date_field_separator** | str | No | . | Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
|
|
308
|
+
| **date_field_separator** | str | No | . | [DEPRECATED] (separator automatically inferred) Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
|
|
308
309
|
|
|
309
310
|
**Returned element type**: ```dict```. Each element is a document matching the given query.
|
|
310
311
|
|
|
@@ -8,7 +8,7 @@ authors = [
|
|
|
8
8
|
{ name = "Diego Giorgini, Luigi Di Sotto, Saeed Choobani", email = "diego.giorgini@aitechnologies.it" }
|
|
9
9
|
]
|
|
10
10
|
description = "datafun brings the fun back to data pipelines"
|
|
11
|
-
version = "0.5.2"
|
|
11
|
+
version = "0.6.1"
|
|
12
12
|
requires-python = ">=3.8"
|
|
13
13
|
dependencies = [
|
|
14
14
|
"backoff",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|