PyPI - datafun - Versions diffs - 0.5.2__tar.gz → 0.6.1__tar.gz - Mend

datafun 0.5.2 (tar.gz) → 0.6.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

{datafun-0.5.2 → datafun-0.6.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: datafun
-Version: 0.5.2
+Version: 0.6.1
 Summary: datafun brings the fun back to data pipelines
 Author-email: "Diego Giorgini, Luigi Di Sotto, Saeed Choobani" <diego.giorgini@aitechnologies.it>
 Requires-Python: >=3.8
@@ -12,6 +12,7 @@ Requires-Dist: tqdm
 Requires-Dist: google-cloud-storage
 Requires-Dist: elasticsearch<8
 Requires-Dist: requests
+Dynamic: license-file
 # 🍻 datafun [![Downloads](https://pepy.tech/badge/datafun)](https://pepy.tech/project/datafun)
@@ -304,7 +305,7 @@ You can see examples for every operation in the [dedicated notebook](./examples/
 | **start_isodate**        | str (ISO datetime)    | Yes      |            | Elastic start date range with format: "2021-09-15T10:00:00.000Z"                                                         |
 | **end_isodate**          | str (ISO datetime)    | Yes      |            | Elastic end date range with format: "2021-09-15T10:00:00.000Z"                                                           |
 | **date_field**           | str                   | No       | @timestamp | Elastic date field. Can be nested into list, eg. "messages.date"                                                         |
-| **date_field_separator** | str                   | No       | .          | Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
+| **date_field_separator** | str                   | No       | .          | [DEPRECATED] (separator automatically inferred) Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
 **Returned element type**: ```dict```. Each element is a document matching the given query.

{datafun-0.5.2 → datafun-0.6.1}/README.md RENAMED Viewed

@@ -289,7 +289,7 @@ You can see examples for every operation in the [dedicated notebook](./examples/
 | **start_isodate**        | str (ISO datetime)    | Yes      |            | Elastic start date range with format: "2021-09-15T10:00:00.000Z"                                                         |
 | **end_isodate**          | str (ISO datetime)    | Yes      |            | Elastic end date range with format: "2021-09-15T10:00:00.000Z"                                                           |
 | **date_field**           | str                   | No       | @timestamp | Elastic date field. Can be nested into list, eg. "messages.date"                                                         |
-| **date_field_separator** | str                   | No       | .          | Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
+| **date_field_separator** | str                   | No       | .          | [DEPRECATED] (separator automatically inferred) Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
 **Returned element type**: ```dict```. Each element is a document matching the given query.

{datafun-0.5.2 → datafun-0.6.1}/datafun/sources/elk.py RENAMED Viewed

@@ -31,6 +31,8 @@ class ELKDatasetConfig:
 class ELKDataset(DatasetSource):
     def __init__(self, config: ELKDatasetConfig, **kwargs):
+        if 'date_field_separator' in kwargs:
+            print("WARN: date_field_separator is deprecated, the separator is now automatically inferred")
         super().__init__(config=config, **kwargs)
         self.es = Elasticsearch(
@@ -53,28 +55,36 @@ class ELKDataset(DatasetSource):
     def _generate_examples(self) -> Generator[dict, None, None]:
         for Q in self.Qs:
-            res = self._search(index=self.config.index, body=json.dumps(Q), scroll='20s')
-            hits = dl.get(res, 'hits.hits', default=[])
-            old_scroll_id = dl.get(res, '_scroll_id')
-            while len(hits) > 0:
-                # iterate over the document hits for each 'scroll'
-                for doc in hits:  # type: ignore
-                    yield doc
-                # make a request using the Scroll API
-                try:
-                    res = self._scroll(scroll_id=old_scroll_id, scroll='20s')
-                    hits = dl.get(res, 'hits.hits')
-                    # keep track of past scroll_id
-                    old_scroll_id = dl.get(res, '_scroll_id')
-                except exceptions.NotFoundError:
-                    hits = []
-                except exceptions.RequestError:
-                    # _scroll_id is None, because the response is not "paginated"
-                    hits = []
-            self.es.clear_scroll(scroll_id=old_scroll_id)
+            res = None
+            old_scroll_id = None
+            try:
+                res = self._search(index=self.config.index, body=json.dumps(Q), scroll='20s')
+                hits = dl.get(res, 'hits.hits', default=[])
+                old_scroll_id = dl.get(res, '_scroll_id')
+                while len(hits) > 0:
+                    # iterate over the document hits for each 'scroll'
+                    for doc in hits:  # type: ignore
+                        yield doc
+                    # make a request using the Scroll API
+                    try:
+                        res = self._scroll(scroll_id=old_scroll_id, scroll='20s')
+                        hits = dl.get(res, 'hits.hits')
+                        # keep track of past scroll_id
+                        old_scroll_id = dl.get(res, '_scroll_id')
+                    except exceptions.NotFoundError:
+                        hits = []
+                    except exceptions.RequestError:
+                        # _scroll_id is None, because the response is not "paginated"
+                        hits = []
+            finally:
+                if old_scroll_id is not None:
+                    try:
+                        self.es.clear_scroll(scroll_id=old_scroll_id)
+                    except Exception:
+                        # Clear failures should not hide the original exception
+                        pass
     def set_time_interval(self, query: dict) -> dict:
         if not isinstance(query, dict):
@@ -94,11 +104,17 @@ class ELKDataset(DatasetSource):
         if not isinstance(xs, List):
             raise TypeError(f'Field query.bool.filter must be of type List, but found of type {type(xs)}')
-        sep = self.config.date_field_separator
+        path_sep_alts = ['/', '//--@@--//']
+        path_sep = '.'
+        while path_sep in self.config.date_field:
+            try:
+                path_sep = path_sep_alts.pop(0)
+            except Exception as e:
+                raise ValueError(f'Field {self.config.date_field} contains invalid characters. Exception: {e}')
         for idx, obj in enumerate(xs):
             if dl.has(obj, "range"):
-                obj = dl.update(obj, f"range{sep}{self.config.date_field}{sep}gte", value=self.config.start_isodate, sep=sep)
-                obj = dl.update(obj, f"range{sep}{self.config.date_field}{sep}lte", value=self.config.end_isodate, sep=sep)
+                obj = dl.update(obj, f"range{path_sep}{self.config.date_field}{path_sep}gte", value=self.config.start_isodate, sep=path_sep)
+                obj = dl.update(obj, f"range{path_sep}{self.config.date_field}{path_sep}lte", value=self.config.end_isodate, sep=path_sep)
                 if not obj:
                     raise ValueError(f'{self.config.date_field}.lte or {self.config.date_field}.lte fields can\'t be updated, e.g. check '
                                     'if they exist in the query.')

{datafun-0.5.2 → datafun-0.6.1}/datafun.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: datafun
-Version: 0.5.2
+Version: 0.6.1
 Summary: datafun brings the fun back to data pipelines
 Author-email: "Diego Giorgini, Luigi Di Sotto, Saeed Choobani" <diego.giorgini@aitechnologies.it>
 Requires-Python: >=3.8
@@ -12,6 +12,7 @@ Requires-Dist: tqdm
 Requires-Dist: google-cloud-storage
 Requires-Dist: elasticsearch<8
 Requires-Dist: requests
+Dynamic: license-file
 # 🍻 datafun [![Downloads](https://pepy.tech/badge/datafun)](https://pepy.tech/project/datafun)
@@ -304,7 +305,7 @@ You can see examples for every operation in the [dedicated notebook](./examples/
 | **start_isodate**        | str (ISO datetime)    | Yes      |            | Elastic start date range with format: "2021-09-15T10:00:00.000Z"                                                         |
 | **end_isodate**          | str (ISO datetime)    | Yes      |            | Elastic end date range with format: "2021-09-15T10:00:00.000Z"                                                           |
 | **date_field**           | str                   | No       | @timestamp | Elastic date field. Can be nested into list, eg. "messages.date"                                                         |
-| **date_field_separator** | str                   | No       | .          | Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
+| **date_field_separator** | str                   | No       | .          | [DEPRECATED] (separator automatically inferred) Separator for date_field used to split the path. Use different ones to NOT split and consider date_field as single field |
 **Returned element type**: ```dict```. Each element is a document matching the given query.

{datafun-0.5.2 → datafun-0.6.1}/pyproject.toml RENAMED Viewed

@@ -8,7 +8,7 @@ authors = [
     { name = "Diego Giorgini, Luigi Di Sotto, Saeed Choobani", email = "diego.giorgini@aitechnologies.it" }
 ]
 description = "datafun brings the fun back to data pipelines"
-version = "0.5.2"
+version = "0.6.1"
 requires-python = ">=3.8"
 dependencies = [
     "backoff",