ckanapi-harvesters 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. ckanapi_harvesters/__init__.py +32 -10
  2. ckanapi_harvesters/auxiliary/__init__.py +26 -0
  3. ckanapi_harvesters/auxiliary/ckan_action.py +93 -0
  4. ckanapi_harvesters/auxiliary/ckan_api_key.py +213 -0
  5. ckanapi_harvesters/auxiliary/ckan_auxiliary.py +293 -0
  6. ckanapi_harvesters/auxiliary/ckan_configuration.py +50 -0
  7. ckanapi_harvesters/auxiliary/ckan_defs.py +10 -0
  8. ckanapi_harvesters/auxiliary/ckan_errors.py +129 -0
  9. ckanapi_harvesters/auxiliary/ckan_map.py +509 -0
  10. ckanapi_harvesters/auxiliary/ckan_model.py +992 -0
  11. ckanapi_harvesters/auxiliary/ckan_vocabulary_deprecated.py +104 -0
  12. ckanapi_harvesters/auxiliary/deprecated.py +82 -0
  13. ckanapi_harvesters/auxiliary/error_level_message.py +51 -0
  14. ckanapi_harvesters/auxiliary/external_code_import.py +98 -0
  15. ckanapi_harvesters/auxiliary/list_records.py +60 -0
  16. ckanapi_harvesters/auxiliary/login.py +163 -0
  17. ckanapi_harvesters/auxiliary/path.py +208 -0
  18. ckanapi_harvesters/auxiliary/proxy_config.py +298 -0
  19. ckanapi_harvesters/auxiliary/urls.py +40 -0
  20. ckanapi_harvesters/builder/__init__.py +40 -0
  21. ckanapi_harvesters/builder/builder_aux.py +20 -0
  22. ckanapi_harvesters/builder/builder_ckan.py +238 -0
  23. ckanapi_harvesters/builder/builder_errors.py +36 -0
  24. ckanapi_harvesters/builder/builder_field.py +122 -0
  25. ckanapi_harvesters/builder/builder_package.py +9 -0
  26. ckanapi_harvesters/builder/builder_package_1_basic.py +1291 -0
  27. ckanapi_harvesters/builder/builder_package_2_harvesters.py +40 -0
  28. ckanapi_harvesters/builder/builder_package_3_multi_threaded.py +45 -0
  29. ckanapi_harvesters/builder/builder_package_example.xlsx +0 -0
  30. ckanapi_harvesters/builder/builder_resource.py +589 -0
  31. ckanapi_harvesters/builder/builder_resource_datastore.py +561 -0
  32. ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py +367 -0
  33. ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py +273 -0
  34. ckanapi_harvesters/builder/builder_resource_datastore_multi_harvester.py +278 -0
  35. ckanapi_harvesters/builder/builder_resource_datastore_unmanaged.py +145 -0
  36. ckanapi_harvesters/builder/builder_resource_datastore_url.py +150 -0
  37. ckanapi_harvesters/builder/builder_resource_init.py +126 -0
  38. ckanapi_harvesters/builder/builder_resource_multi_abc.py +361 -0
  39. ckanapi_harvesters/builder/builder_resource_multi_datastore.py +146 -0
  40. ckanapi_harvesters/builder/builder_resource_multi_file.py +505 -0
  41. ckanapi_harvesters/builder/example/__init__.py +21 -0
  42. ckanapi_harvesters/builder/example/builder_example.py +21 -0
  43. ckanapi_harvesters/builder/example/builder_example_aux_fun.py +24 -0
  44. ckanapi_harvesters/builder/example/builder_example_download.py +44 -0
  45. ckanapi_harvesters/builder/example/builder_example_generate_data.py +73 -0
  46. ckanapi_harvesters/builder/example/builder_example_patch_upload.py +51 -0
  47. ckanapi_harvesters/builder/example/builder_example_policy.py +114 -0
  48. ckanapi_harvesters/builder/example/builder_example_test_sql.py +53 -0
  49. ckanapi_harvesters/builder/example/builder_example_tests.py +87 -0
  50. ckanapi_harvesters/builder/example/builder_example_tests_offline.py +57 -0
  51. ckanapi_harvesters/builder/example/package/ckan-dpg.svg +74 -0
  52. ckanapi_harvesters/builder/example/package/users_local.csv +3 -0
  53. ckanapi_harvesters/builder/mapper_datastore.py +93 -0
  54. ckanapi_harvesters/builder/mapper_datastore_multi.py +262 -0
  55. ckanapi_harvesters/builder/specific/__init__.py +11 -0
  56. ckanapi_harvesters/builder/specific/configuration_builder.py +66 -0
  57. ckanapi_harvesters/builder/specific_builder_abc.py +23 -0
  58. ckanapi_harvesters/ckan_api/__init__.py +20 -0
  59. ckanapi_harvesters/ckan_api/ckan_api.py +11 -0
  60. ckanapi_harvesters/ckan_api/ckan_api_0_base.py +896 -0
  61. ckanapi_harvesters/ckan_api/ckan_api_1_map.py +1028 -0
  62. ckanapi_harvesters/ckan_api/ckan_api_2_readonly.py +934 -0
  63. ckanapi_harvesters/ckan_api/ckan_api_3_policy.py +229 -0
  64. ckanapi_harvesters/ckan_api/ckan_api_4_readwrite.py +579 -0
  65. ckanapi_harvesters/ckan_api/ckan_api_5_manage.py +1225 -0
  66. ckanapi_harvesters/ckan_api/ckan_api_params.py +192 -0
  67. ckanapi_harvesters/ckan_api/deprecated/__init__.py +9 -0
  68. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated.py +267 -0
  69. ckanapi_harvesters/ckan_api/deprecated/ckan_api_deprecated_vocabularies.py +189 -0
  70. ckanapi_harvesters/harvesters/__init__.py +23 -0
  71. ckanapi_harvesters/harvesters/data_cleaner/__init__.py +17 -0
  72. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_abc.py +240 -0
  73. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_errors.py +23 -0
  74. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload.py +9 -0
  75. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_1_basic.py +430 -0
  76. ckanapi_harvesters/harvesters/data_cleaner/data_cleaner_upload_2_geom.py +98 -0
  77. ckanapi_harvesters/harvesters/file_formats/__init__.py +10 -0
  78. ckanapi_harvesters/harvesters/file_formats/csv_format.py +43 -0
  79. ckanapi_harvesters/harvesters/file_formats/file_format_abc.py +39 -0
  80. ckanapi_harvesters/harvesters/file_formats/file_format_init.py +25 -0
  81. ckanapi_harvesters/harvesters/file_formats/shp_format.py +129 -0
  82. ckanapi_harvesters/harvesters/harvester_abc.py +190 -0
  83. ckanapi_harvesters/harvesters/harvester_errors.py +31 -0
  84. ckanapi_harvesters/harvesters/harvester_init.py +30 -0
  85. ckanapi_harvesters/harvesters/harvester_model.py +49 -0
  86. ckanapi_harvesters/harvesters/harvester_params.py +323 -0
  87. ckanapi_harvesters/harvesters/postgre_harvester.py +495 -0
  88. ckanapi_harvesters/harvesters/postgre_params.py +86 -0
  89. ckanapi_harvesters/harvesters/pymongo_data_cleaner.py +173 -0
  90. ckanapi_harvesters/harvesters/pymongo_harvester.py +355 -0
  91. ckanapi_harvesters/harvesters/pymongo_params.py +54 -0
  92. ckanapi_harvesters/policies/__init__.py +20 -0
  93. ckanapi_harvesters/policies/data_format_policy.py +269 -0
  94. ckanapi_harvesters/policies/data_format_policy_abc.py +97 -0
  95. ckanapi_harvesters/policies/data_format_policy_custom_fields.py +156 -0
  96. ckanapi_harvesters/policies/data_format_policy_defs.py +135 -0
  97. ckanapi_harvesters/policies/data_format_policy_errors.py +79 -0
  98. ckanapi_harvesters/policies/data_format_policy_lists.py +234 -0
  99. ckanapi_harvesters/policies/data_format_policy_tag_groups.py +35 -0
  100. ckanapi_harvesters/reports/__init__.py +11 -0
  101. ckanapi_harvesters/reports/admin_report.py +292 -0
  102. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/METADATA +74 -38
  103. ckanapi_harvesters-0.0.2.dist-info/RECORD +105 -0
  104. ckanapi_harvesters/divider/__init__.py +0 -27
  105. ckanapi_harvesters/divider/divider.py +0 -53
  106. ckanapi_harvesters/divider/divider_error.py +0 -59
  107. ckanapi_harvesters/main.py +0 -30
  108. ckanapi_harvesters-0.0.0.dist-info/RECORD +0 -9
  109. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/WHEEL +0 -0
  110. {ckanapi_harvesters-0.0.0.dist-info → ckanapi_harvesters-0.0.2.dist-info}/top_level.txt +0 -0
ckanapi_harvesters/builder/builder_resource_datastore_multi_abc.py
@@ -0,0 +1,367 @@
+ #!python3
+ # -*- coding: utf-8 -*-
+ """
+ Code to initiate a DataStore defined by a large number of files to concatenate into one table.
+ """
+ from concurrent.futures import ThreadPoolExecutor
+ import threading
+ from threading import current_thread
+ from abc import ABC, abstractmethod
+ from typing import Dict, List, Callable, Any, Tuple, Generator, Union, Set
+ from warnings import warn
+ import copy
+
+ import pandas as pd
+
+ from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreABC
+ from ckanapi_harvesters.builder.builder_aux import positive_end_index
+ from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice, CkanResourceInfo
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import datastore_id_col
+ from ckanapi_harvesters.ckan_api import CkanApi
+ from ckanapi_harvesters.builder.mapper_datastore_multi import RequestFileMapperABC, default_file_mapper_from_primary_key
+ from ckanapi_harvesters.builder.builder_resource_multi_file import BuilderMultiABC, default_progress_callback
+
+ # apply last_condition for each upsert request during a multi-threaded upload to the same DataStore:
+ datastore_multi_threaded_always_last_condition:bool = True
+ # when there are multiple files, apply the final insertion commands after each part? True: after each CSV file; False: only at the end
+ datastore_multi_apply_last_condition_intermediary:bool = False
+
+
+ class BuilderDataStoreMultiABC(BuilderDataStoreABC, BuilderMultiABC, ABC):
+     """
+     Generic class to manage a large DataStore divided into files/parts.
+     This abstract class is intended to be overloaded in order to generate data from the workspace without using CSV files.
+     """
+
+     def __init__(self, *, name:str=None, format:str=None, description:str=None,
+                  resource_id:str=None, download_url:str=None, dirname:str=None):
+         super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
+         # Functions inputs/outputs
+         self.df_mapper: RequestFileMapperABC = default_file_mapper_from_primary_key(self.primary_key)
+         self.reupload_if_needed = False  # do not reupload if needed because this could cause data loss (the upload function only uploads the first table)
+         self.upsert_method: UpsertChoice = UpsertChoice.Upsert
+         # BuilderMultiABC:
+         self.stop_event = threading.Event()
+         self.thread_ckan: Dict[str, CkanApi] = {}
+         self.progress_callback: Union[Callable[[int, int, Any], None], None] = default_progress_callback
+         self.progress_callback_kwargs: dict = {}
+         self.enable_multi_threaded_upload:bool = True
+         self.enable_multi_threaded_download:bool = True
+
+     def copy(self, *, dest=None):
+         super().copy(dest=dest)
+         dest.reupload_if_needed = self.reupload_if_needed
+         # BuilderMultiABC:
+         dest.progress_callback = self.progress_callback
+         dest.progress_callback_kwargs = copy.deepcopy(self.progress_callback_kwargs)
+         dest.enable_multi_threaded_upload = self.enable_multi_threaded_upload
+         dest.enable_multi_threaded_download = self.enable_multi_threaded_download
+         # do not copy stop_event
+         return dest
+
+     ## upload ---------
+     @abstractmethod
+     def get_local_df_generator(self, resources_base_dir:str) -> Generator[pd.DataFrame, None, None]:
+         """
+         Returns an iterator over the parts of the upload, loaded as DataFrames (not recommended in a multi-threaded context).
+         """
+         raise NotImplementedError()
+
+     @abstractmethod
+     def load_local_df(self, file: Any, **kwargs) -> pd.DataFrame:
+         """
+         Load the DataFrame pointed to by the upload part "file"
+         """
+         raise NotImplementedError()
+
+     # do not change default argument apply_last_condition=True
+     # def upsert_request_df(self, ckan: CkanApi, df_upload:pd.DataFrame,
+     #                       method:UpsertChoice=UpsertChoice.Upsert,
+     #                       apply_last_condition:bool=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
+     #     # calls super method, with apply_last_condition defaulting to datastore_multi_apply_last_condition_intermediary
+     #     if apply_last_condition is None:
+     #         apply_last_condition = True  # datastore_multi_apply_last_condition_intermediary
+     #     return super().upsert_request_df(ckan=ckan, df_upload=df_upload, method=method,
+     #                                      apply_last_condition=apply_last_condition)
+
+     def _get_primary_key_indexes(self, data_cleaner_index: Set[str], current_fields:Set[str], error_missing:bool, empty_datastore:bool=False) -> Tuple[Union[List[str],None], Union[List[str],None]]:
+         primary_key, indexes = super()._get_primary_key_indexes(data_cleaner_index, current_fields, error_missing, empty_datastore)
+         # it is highly recommended to specify a primary key: warning if not defined
+         if primary_key is None:
+             msg = f"It is highly recommended to specify the primary key for a DataStore defined from a directory to ensure no duplicate values are upserted to the database. Resource: {self.name}"
+             warn(msg)
+         else:
+             ultra_required_fields = set(primary_key)
+             missing_fields = ultra_required_fields
+             if current_fields is not None:
+                 missing_fields -= current_fields
+             if len(missing_fields) > 0:
+                 msg = f"The primary key {self.primary_key} is set for resource {self.name} but it is not present in the sample data."
+                 warn(msg)
+         if primary_key is None or len(primary_key) == 0:
+             self.upsert_method = UpsertChoice.Insert  # do not use upsert
+         return primary_key, indexes
+
+     def upsert_request_final(self, ckan: CkanApi, *, force:bool=False) -> None:
+         """
+         Final steps after the last upsert query.
+         This call is mandatory at the end of all requests if the user called upsert_request_df manually for a multi-part DataStore.
+
+         :param ckan:
+         :param force: perform the request anyway
+         :return:
+         """
+         force = force or not datastore_multi_apply_last_condition_intermediary
+         return super().upsert_request_final(ckan, force=force)
+
+     def upload_request_final(self, ckan: CkanApi, *, force:bool=False) -> None:
+         return self.upsert_request_final(ckan=ckan, force=force)
+
+     def upsert_request_df_no_return(self, ckan: CkanApi, df_upload:pd.DataFrame,
+                                     method:UpsertChoice=UpsertChoice.Upsert,
+                                     apply_last_condition:bool=None, always_last_condition:bool=None) -> None:
+         """
+         Calls upsert_request_df but does not return anything.
+
+         :return:
+         """
+         self.upsert_request_df(ckan=ckan, df_upload=df_upload, method=method,
+                                apply_last_condition=apply_last_condition, always_last_condition=always_last_condition)
+         return None
+
+     def _unit_upload_apply(self, *, ckan: CkanApi, file: str,
+                            index: int, start_index: int, end_index: int, total: int,
+                            method: UpsertChoice, **kwargs) -> None:
+         if index == 0 and self.upsert_method == UpsertChoice.Insert:
+             return  # do not reupload the first document, which was used for the initialization of the dataset
+         if start_index <= index and index < end_index:
+             df_upload_local = self.load_local_df(file, **kwargs)
+             self._call_progress_callback(index, total, info=df_upload_local,
+                                          context=f"{ckan.identifier} single-thread upload")
+             self.upsert_request_df_no_return(ckan=ckan, df_upload=df_upload_local, method=method,
+                                              apply_last_condition=datastore_multi_apply_last_condition_intermediary)
+         else:
+             # self._call_progress_callback(index, total, info=df_upload_local, context=f"{ckan.identifier} single-thread skip")
+             pass
+
+     def upload_request_full(self, ckan:CkanApi, resources_base_dir:str, *,
+                             method:UpsertChoice=None,
+                             threads:int=1, external_stop_event=None,
+                             only_missing:bool=False,
+                             start_index:int=0, end_index:int=None, **kwargs) -> None:
+         self.df_mapper.upsert_only_missing_rows = only_missing
+         if method is None:
+             if self.primary_key is None or len(self.primary_key) == 0:
+                 self.upsert_method = UpsertChoice.Insert  # do not use upsert if there is no primary key
+             method = self.upsert_method
+         super().upload_request_full(ckan=ckan, resources_base_dir=resources_base_dir,
+                                     threads=threads, external_stop_event=external_stop_event,
+                                     start_index=start_index, end_index=end_index,
+                                     method=method, **kwargs)
+         # if threads < 0:
+         #     # cancel large uploads in this case
+         #     return None
+         # elif threads is None or threads > 1:
+         #     return self.upload_request_full_multi_threaded(resources_base_dir=resources_base_dir, ckan=ckan, method=method,
+         #                                                    threads=threads, external_stop_event=external_stop_event,
+         #                                                    start_index=start_index, end_index=end_index)
+         # else:
+         #     self.init_local_files_list(resources_base_dir=resources_base_dir, cancel_if_present=True)
+         #     if ckan.verbose_extra:
+         #         print(f"Launching single-threaded upload of multi-file resource {self.name}")
+         #     total = self.get_local_file_len()
+         #     end_index = positive_end_index(end_index, total)
+         #     for index, file in enumerate(self.get_local_file_generator(resources_base_dir=resources_base_dir)):
+         #         if external_stop_event is not None and external_stop_event.is_set():
+         #             print(f"{ckan.identifier} Interrupted")
+         #             return
+         #         self._unit_upload_apply(ckan=ckan, file=file,
+         #                                 index=index, start_index=start_index, end_index=end_index, total=total,
+         #                                 method=method)
+         #     self._call_progress_callback(total, total, context=f"{ckan.identifier} single-thread upload")
+         #     # at last, apply final actions:
+         #     self.upload_request_final(ckan, force=not datastore_multi_apply_last_condition_intermediary)
+
+     # def upsert_request_file_graceful(self, ckan: CkanApi, file: Any, index:int,
+     #                                  method: UpsertChoice = UpsertChoice.Upsert, external_stop_event=None,
+     #                                  start_index:int=0, end_index:int=None) -> None:
+     #     """
+     #     Calls upsert_request_df_clear with checks specific to multi-threading.
+     #
+     #     :return:
+     #     """
+     #     # ckan.session_reset()
+     #     # ckan.identifier = current_thread().name
+     #     ckan = self.thread_ckan[current_thread().name]
+     #     total = self.get_local_file_len()
+     #     end_index = positive_end_index(end_index, total)
+     #     if self.stop_event.is_set():
+     #         return
+     #     if external_stop_event is not None and external_stop_event.is_set():
+     #         print(f"{ckan.identifier} Interrupted")
+     #         return
+     #     try:
+     #         self._unit_upload_apply(ckan=ckan, file=file,
+     #                                 index=index, start_index=start_index, end_index=end_index, total=total,
+     #                                 method=method)
+     #     except Exception as e:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print(f"Stopping all threads because an exception occurred in thread: {e}")
+     #         raise e from e
+
+     # def upload_request_full_multi_threaded(self, ckan: CkanApi, resources_base_dir: str, threads: int = None,
+     #                                        method: UpsertChoice = UpsertChoice.Upsert, external_stop_event=None,
+     #                                        start_index:int=0, end_index:int=None, **kwargs):
+     #     """
+     #     Multi-threaded implementation of upload_request_full, using ThreadPoolExecutor.
+     #     """
+     #     self.init_local_files_list(resources_base_dir=resources_base_dir, cancel_if_present=True)
+     #     resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=True)  # prepare CKAN object for multi-threading: perform mapping requests if necessary
+     #     self._prepare_for_multithreading(ckan)
+     #     try:
+     #         with ThreadPoolExecutor(max_workers=threads, initializer=self._init_thread, initargs=(ckan,)) as executor:
+     #             if ckan.verbose_extra:
+     #                 print(f"Launching multi-threaded upload of multi-file resource {self.name}")
+     #             futures = [executor.submit(self.upsert_request_file_graceful, ckan=ckan, file=file, method=method, index=index,
+     #                                        start_index=start_index, end_index=end_index, external_stop_event=external_stop_event)
+     #                        for index, file in enumerate(self.get_local_file_generator(resources_base_dir=resources_base_dir))]
+     #             for future in futures:
+     #                 future.result()  # This will propagate the exception
+     #         total = self.get_local_file_len()
+     #         self._call_progress_callback(total, total, context=f"{ckan.identifier} multi-thread upload")
+     #     except Exception as e:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print(f"Stopping all threads because an exception occurred: {e}")
+     #         raise e from e
+     #     finally:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print("End of multi-threaded upload...")
+     #     # at last, apply final actions:
+     #     self.upload_request_final(ckan, force=not datastore_multi_apply_last_condition_intermediary)
+
+
+     ## download -------
+     def download_request_df(self, ckan: CkanApi, file_query:dict) -> Union[pd.DataFrame,None]:
+         """
+         Download the DataFrame matching the file_query arguments.
+         """
+         resource_id = self.get_or_query_resource_id(ckan, error_not_found=self.download_error_not_found)
+         if resource_id is None and not self.download_error_not_found:
+             return None
+         df_download = self.df_mapper.download_file_query(ckan=ckan, resource_id=resource_id, file_query=file_query)
+         df = self.df_mapper.df_download_alter(df_download, file_query=file_query, fields=self._get_fields_info())
+         return df
+
+     def _unit_download_apply(self, ckan:CkanApi, file_query_item:Any, out_dir:str,
+                              index:int, start_index:int, end_index:int, total:int) -> Any:
+         if start_index <= index and index < end_index:
+             self._call_progress_callback(index, total, info=file_query_item,
+                                          context=f"{ckan.identifier} single-thread download")
+             self.download_file_query_item(ckan=ckan, out_dir=out_dir, file_query_item=file_query_item)
+         else:
+             pass
+             # self._call_progress_callback(index, total, info=file_query_item, context=f"{ckan.identifier} single-thread skip")
+
+     def download_request_full(self, ckan: CkanApi, out_dir: str, threads:int=1, external_stop_event=None,
+                               start_index:int=0, end_index:int=None, force:bool=False) -> None:
+         return super().download_request_full(ckan=ckan, out_dir=out_dir,
+                                              threads=threads, external_stop_event=external_stop_event,
+                                              start_index=start_index, end_index=end_index, force=force)
+         # if (not self.enable_download) and (not force):
+         #     msg = f"Did not download resource {self.name} because download was disabled."
+         #     warn(msg)
+         #     return None
+         # if threads < 0:
+         #     # do not download large datasets in this case
+         #     return None
+         # elif threads is None or threads > 1:
+         #     return self.download_request_full_multi_threaded(ckan=ckan, out_dir=out_dir,
+         #                                                      threads=threads, external_stop_event=external_stop_event,
+         #                                                      start_index=start_index, end_index=end_index)
+         # else:
+         #     self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True)
+         #     if ckan.verbose_extra:
+         #         print(f"Launching single-threaded download of multi-file resource {self.name}")
+         #     total = self.get_file_query_len()
+         #     end_index = positive_end_index(end_index, total)
+         #     for index, file_query_item in enumerate(self.get_file_query_generator()):
+         #         if external_stop_event is not None and external_stop_event.is_set():
+         #             print(f"{ckan.identifier} Interrupted")
+         #             return
+         #         self._unit_download_apply(ckan=ckan, file_query_item=file_query_item,
+         #                                   index=index, start_index=start_index, end_index=end_index, total=total)
+         #     self._call_progress_callback(total, total, context=f"{ckan.identifier} single-thread download")
+
+     def download_request_generator(self, ckan: CkanApi, out_dir: str) -> Generator[Tuple[Any, pd.DataFrame], Any, None]:
+         """
+         Iterator over the file_queries.
+         """
+         self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True)
+         for file_query_item in self.get_file_query_generator():
+             yield self.download_file_query_item(ckan=ckan, out_dir=out_dir, file_query_item=file_query_item)
+
+     # def download_file_query_item_graceful(self, ckan: CkanApi, out_dir: str, file_query_item: Any, index:int,
+     #                                       external_stop_event=None, start_index:int=0, end_index:int=None) -> None:
+     #     """
+     #     Implementation of download_file_query_item with checks for a multi-threaded download.
+     #     """
+     #     # ckan.session_reset()
+     #     # ckan.identifier = current_thread().name
+     #     ckan = self.thread_ckan[current_thread().name]
+     #     total = self.get_file_query_len()
+     #     end_index = positive_end_index(end_index, total)
+     #     if self.stop_event.is_set():
+     #         return
+     #     if external_stop_event is not None and external_stop_event.is_set():
+     #         print(f"{ckan.identifier} Interrupted")
+     #         return
+     #     try:
+     #         # self._unit_download_apply(ckan=ckan, file_query_item=file_query_item,
+     #         #                           index=index, start_index=start_index, end_index=end_index, total=total)
+     #     except Exception as e:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print(f"Stopping all threads because an exception occurred in thread: {e}")
+     #         raise e from e
+
+     # def download_request_full_multi_threaded(self, ckan: CkanApi, out_dir: str,
+     #                                          threads: int = None, external_stop_event=None,
+     #                                          start_index:int=0, end_index:int=-1) -> None:
+     #     """
+     #     Multi-threaded implementation of download_request_full using ThreadPoolExecutor.
+     #     """
+     #     self.init_download_file_query_list(ckan=ckan, out_dir=out_dir, cancel_if_present=True)
+     #     self._prepare_for_multithreading(ckan)
+     #     try:
+     #         with ThreadPoolExecutor(max_workers=threads, initializer=self._init_thread, initargs=(ckan,)) as executor:
+     #             if ckan.verbose_extra:
+     #                 print(f"Launching multi-threaded download of multi-file resource {self.name}")
+     #             futures = [executor.submit(self.download_file_query_item_graceful, ckan=ckan, out_dir=out_dir, file_query_item=file_query_item,
+     #                                        index=index, external_stop_event=external_stop_event, start_index=start_index, end_index=end_index)
+     #                        for index, file_query_item in enumerate(self.get_file_query_generator())]
+     #             for future in futures:
+     #                 future.result()  # This will propagate the exception
+     #         total = self.get_file_query_len()
+     #         self._call_progress_callback(total, total, context=f"multi-thread download")
+     #     except Exception as e:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print(f"Stopping all threads because an exception occurred: {e}")
+     #         raise e from e
+     #     finally:
+     #         self.stop_event.set()  # Ensure all threads stop
+     #         if ckan.verbose_extra:
+     #             print("End of multi-threaded download...")
+
+     def download_sample_df(self, ckan: CkanApi, search_all:bool=False, **kwargs) -> pd.DataFrame:
+         # alias with search_all=False by default
+         return super().download_sample_df(ckan=ckan, search_all=search_all, **kwargs)
+
+     def download_sample(self, ckan:CkanApi, full_download:bool=False, **kwargs) -> bytes:
+         # alias with full_download=False by default
+         return super().download_sample(ckan=ckan, full_download=full_download, **kwargs)
+
+
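For orientation, here is a minimal, hypothetical subclass sketch of BuilderDataStoreMultiABC that generates its upload parts in memory instead of reading them from disk. The class and attribute names are illustrative only, and depending on the abstract members inherited from BuilderMultiABC (which are not shown in this diff), further overrides such as get_local_file_generator or get_local_file_len may also be required.

```python
# Hypothetical sketch (not part of the package): a subclass whose upload parts
# are pre-built DataFrames rather than CSV files on disk.
from typing import Any, Generator, List

import pandas as pd

from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import BuilderDataStoreMultiABC


class InMemoryPartsBuilder(BuilderDataStoreMultiABC):  # hypothetical name
    def __init__(self, parts: List[pd.DataFrame], **kwargs):
        super().__init__(**kwargs)
        self._parts = parts  # the pre-built chunks to upsert, in order

    def get_local_df_generator(self, resources_base_dir: str) -> Generator[pd.DataFrame, None, None]:
        # resources_base_dir is unused here: the parts already live in memory
        yield from self._parts

    def load_local_df(self, file: Any, **kwargs) -> pd.DataFrame:
        # in this sketch, each upload part "file" is already a DataFrame
        return file
```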
ckanapi_harvesters/builder/builder_resource_datastore_multi_folder.py
@@ -0,0 +1,273 @@
+ #!python3
+ # -*- coding: utf-8 -*-
+ """
+ Code to initiate a DataStore defined by a large number of files to concatenate into one table.
+ This concrete implementation is linked to the file system.
+ """
+ from typing import Dict, List, Collection, Callable, Any, Tuple, Generator, Union
+ import os
+ from warnings import warn
+ import glob
+ import copy
+
+ import pandas as pd
+
+ from ckanapi_harvesters.auxiliary.error_level_message import ContextErrorLevelMessage, ErrorLevel
+ from ckanapi_harvesters.builder.mapper_datastore import DataSchemeConversion
+ from ckanapi_harvesters.builder.builder_errors import ResourceFileNotExistMessage
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import BuilderDataStoreMultiABC
+ from ckanapi_harvesters.builder.builder_resource_datastore_multi_abc import datastore_multi_apply_last_condition_intermediary
+ from ckanapi_harvesters.auxiliary.ckan_model import UpsertChoice
+ from ckanapi_harvesters.auxiliary.path import resolve_rel_path, glob_rm_glob, list_files_scandir
+ from ckanapi_harvesters.ckan_api import CkanApi
+ from ckanapi_harvesters.auxiliary.ckan_auxiliary import _string_from_element
+ from ckanapi_harvesters.builder.mapper_datastore_multi import RequestMapperABC, RequestFileMapperABC
+ from ckanapi_harvesters.builder.mapper_datastore_multi import default_file_mapper_from_primary_key
+ from ckanapi_harvesters.builder.builder_resource_datastore import BuilderDataStoreFile
+
+
+ class BuilderDataStoreFolder(BuilderDataStoreMultiABC):
+     def __init__(self, *, file_query_list: List[Tuple[str,dict]]=None, name:str=None, format:str=None, description:str=None,
+                  resource_id:str=None, download_url:str=None, dir_name:str=None):
+         super().__init__(name=name, format=format, description=description, resource_id=resource_id, download_url=download_url)
+         self.dir_name = dir_name
+         # Functions inputs/outputs
+         self.df_mapper: RequestFileMapperABC = default_file_mapper_from_primary_key(self.primary_key, file_query_list)
+         self.local_file_list_base_dir:Union[str,None] = None
+         self.local_file_list:Union[List[str],None] = None
+         self.downloaded_file_query_list:Collection[Tuple[str,dict]] = file_query_list
+
+     def copy(self, *, dest=None):
+         if dest is None:
+             dest = BuilderDataStoreFolder()
+         super().copy(dest=dest)
+         dest.dir_name = self.dir_name
+         dest.local_file_list_base_dir = self.local_file_list_base_dir
+         dest.local_file_list = copy.deepcopy(self.local_file_list)
+         dest.downloaded_file_query_list = copy.deepcopy(self.downloaded_file_query_list)
+         return dest
+
+     def _load_from_df_row(self, row: pd.Series, base_dir:str=None) -> None:
+         super()._load_from_df_row(row=row)
+         self.df_mapper = default_file_mapper_from_primary_key(self.primary_key)
+         self.dir_name: str = _string_from_element(row["file/url"])
+
+     def setup_default_file_mapper(self, primary_key:List[str]=None, file_query_list:Collection[Tuple[str, dict]]=None) -> None:
+         """
+         This function enables the user to define the primary key and initializes the default file mapper.
+         :param primary_key: manually specify the primary key
+         :param file_query_list: optionally pre-set the list of (file name, file query) tuples
+         :return:
+         """
+         df_mapper_mem = self.df_mapper
+         if primary_key is not None:
+             self.primary_key = primary_key
+         self.df_mapper = default_file_mapper_from_primary_key(self.primary_key, file_query_list)
+         if file_query_list is not None:
+             self.downloaded_file_query_list = file_query_list
+         # preserve upload/download functions
+         self.df_mapper.df_upload_fun = df_mapper_mem.df_upload_fun
+         self.df_mapper.df_download_fun = df_mapper_mem.df_download_fun
+
+     @staticmethod
+     def resource_mode_str() -> str:
+         return "DataStore from Folder"
+
+     def _to_dict(self, include_id:bool=True) -> dict:
+         d = super()._to_dict(include_id=include_id)
+         d["File/URL"] = self.dir_name
+         return d
+
+     @staticmethod
+     def from_file_datastore(resource_file: BuilderDataStoreFile,
+                             *, dir_name:str=None, primary_key:List[str]=None,
+                             file_query_list:Collection[Tuple[str,dict]]=None) -> "BuilderDataStoreFolder":
+         resource_folder = BuilderDataStoreFolder()
+         resource_folder._load_from_df_row(resource_file._to_row())
+         resource_folder.field_builders = resource_file.field_builders
+         if dir_name is not None:
+             resource_folder.dir_name = dir_name
+         elif isinstance(resource_file, BuilderDataStoreFolder):
+             resource_folder.dir_name = resource_file.dir_name
+         else:
+             resource_folder.dir_name, _ = os.path.splitext(resource_file.file_name)
+         resource_folder.package_name = resource_file.package_name
+         if isinstance(resource_file.df_mapper, RequestMapperABC):
+             resource_folder.df_mapper = resource_file.df_mapper.copy()
+         else:
+             resource_folder.df_mapper.df_upload_fun = resource_file.df_mapper.df_upload_fun
+             resource_folder.df_mapper.df_download_fun = resource_file.df_mapper.df_download_fun
+         if primary_key is not None or file_query_list is not None:
+             resource_folder.setup_default_file_mapper(primary_key=primary_key, file_query_list=file_query_list)
+         resource_folder.downloaded_file_query_list = file_query_list
+         return resource_folder
+
+
+     ## upload ---------------------------------------------------
+     def upload_file_checks(self, *, resources_base_dir:str=None, ckan: CkanApi=None, **kwargs) -> Union[None,ContextErrorLevelMessage]:
+         if os.path.isdir(resolve_rel_path(resources_base_dir, glob_rm_glob(self.dir_name), field=f"File/URL of resource {self.name}")):
+             if len(self.list_local_files(resources_base_dir=resources_base_dir)) > 0:
+                 return None
+             else:
+                 return ResourceFileNotExistMessage(self.name, ErrorLevel.Error,
+                                                    f"Empty resource directory for resource {self.name}: {os.path.join(resources_base_dir, self.dir_name)}")
+         else:
+             return ResourceFileNotExistMessage(self.name, ErrorLevel.Error,
+                                                f"Missing directory for resource {self.name}: {os.path.join(resources_base_dir, self.dir_name)}")
+
+     def get_sample_file_path(self, resources_base_dir:str, file_index:int=0) -> Union[str,None]:
+         self.list_local_files(resources_base_dir=resources_base_dir)
+         return self.local_file_list[file_index]
+
+     def load_sample_df(self, resources_base_dir:str, *, upload_alter:bool=True, file_index:int=0, **kwargs) -> pd.DataFrame:
+         file_path:str = self.get_sample_file_path(resources_base_dir, file_index=file_index)
+         return self.load_local_df(file=file_path, upload_alter=upload_alter, **kwargs)
+
+     def load_local_df(self, file: str, *, upload_alter:bool=True, **kwargs) -> pd.DataFrame:
+         # self.sample_data_source = resolve_rel_path(resources_base_dir, self.dir_name, file, field=f"File/URL of resource {self.name}")
+         self.sample_data_source = file
+         df_local = self.local_file_format.read_file(self.sample_data_source, fields=self._get_fields_info())
+         if isinstance(df_local, pd.DataFrame):
+             df_local.attrs["source"] = self.sample_data_source
+         if upload_alter:
+             df_upload = self.df_mapper.df_upload_alter(df_local, self.sample_data_source, fields=self._get_fields_info())
+             return df_upload
+         else:
+             return df_local
+
+     def get_local_file_generator(self, resources_base_dir:str, **kwargs) -> Generator[str, None, None]:
+         self.list_local_files(resources_base_dir=resources_base_dir)
+         for file_name in self.local_file_list:
+             yield file_name
+
+     def get_local_df_generator(self, resources_base_dir:str, **kwargs) -> Generator[pd.DataFrame, None, None]:
+         self.list_local_files(resources_base_dir=resources_base_dir)
+         for file_name in self.local_file_list:
+             yield self.load_local_df(file_name, **kwargs)
+
+     def list_local_files(self, resources_base_dir:str, cancel_if_present:bool=True) -> List[str]:
+         if cancel_if_present and self.local_file_list is not None and self.local_file_list_base_dir == resources_base_dir:
+             return self.local_file_list
+         dir_search_path = resolve_rel_path(resources_base_dir, self.dir_name, field=f"File/URL of resource {self.name}")
+         # file_list = [os.path.join(dir_search_path, file_name) for file_name in os.listdir(dir_search_path)]
+         # file_list = [os.path.join(file.path, file.name) for file in list(os.scandir(dir_search_path)) if file.is_file()]
+         search_query = dir_search_path
+         file_list = glob.glob(search_query)
+         # file_list = list_files_scandir(dir_search_path)
+         file_list.sort()
+         self.local_file_list = file_list
+         self.local_file_list_base_dir = resources_base_dir
+         return file_list
+
+     def init_local_files_list(self, resources_base_dir:str, cancel_if_present:bool=True, **kwargs) -> List[str]:
+         return self.list_local_files(resources_base_dir=resources_base_dir, cancel_if_present=cancel_if_present)
+
+     def get_local_file_len(self) -> int:
+         if self.local_file_list is None:
+             raise RuntimeError("You must call list_local_files first")
+         return len(self.local_file_list)
+
+     def upsert_request_df(self, ckan: CkanApi, df_upload:pd.DataFrame,
+                           method:UpsertChoice=UpsertChoice.Upsert,
+                           apply_last_condition:bool=None, always_last_condition:bool=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
+         """
+         Call to ckan datastore_upsert.
+         Before sending the DataFrame, a call to df_upload_alter is made.
+         This implementation optionally checks for the last inserted line of the DataFrame based on the first columns of the primary key.
+
+         :param ckan:
+         :param df_upload:
+         :param method:
+         :return:
+         """
+         if apply_last_condition is None:
+             apply_last_condition = True  # datastore_multi_apply_last_condition_intermediary
+         resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=True)
+         df_upload_local = df_upload
+         df_upload_transformed = self.df_mapper.df_upload_alter(df_upload_local, fields=self._get_fields_info())
+         file_query = self.df_mapper.get_file_query_of_df(df_upload_transformed)
+         if file_query is not None:
+             i_restart, upload_needed, row_count, df_row = self.df_mapper.last_inserted_index_request(ckan=ckan,
+                                                                                                     resource_id=resource_id, df_upload=df_upload_transformed, file_query=file_query)
+         else:
+             i_restart, upload_needed, row_count, df_row = 0, True, -1, None
+         if upload_needed:
+             if i_restart > 0 and ckan.params.verbose_extra:
+                 print(f"Starting transfer from index {i_restart}")
+             ret_df = ckan.datastore_upsert(df_upload_transformed.iloc[i_restart:], resource_id, method=method,
+                                            apply_last_condition=apply_last_condition,
+                                            always_last_condition=always_last_condition, data_cleaner=self.data_cleaner_upload)
+         elif 0 <= row_count and row_count < len(df_row):
+             msg = f"Sending the full dataframe because it was shorter on the server side"
+             warn(msg)
+             ret_df = ckan.datastore_upsert(df_upload_transformed, resource_id, method=method,
+                                            apply_last_condition=apply_last_condition,
+                                            always_last_condition=always_last_condition, data_cleaner=self.data_cleaner_upload)
+         else:
+             if ckan.params.verbose_extra:
+                 print(f"File up to date on server side")
+             ret_df = None
+         return df_upload_transformed, ret_df
+
+
+     ## download ---------------------------------------------------------------------------
+     def download_file_query_list(self, ckan: CkanApi, cancel_if_present:bool=True) -> List[Tuple[str, dict]]:
+         resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=self.download_error_not_found)
+         if resource_id is None and self.download_error_not_found:
+             self.downloaded_file_query_list = []
+             return []
+         if not(cancel_if_present and self.downloaded_file_query_list is not None):
+             file_query_list = self.df_mapper.download_file_query_list(ckan=ckan, resource_id=resource_id)
+             self.downloaded_file_query_list = [(self.df_mapper.get_file_name_of_query(file_query), file_query) for file_query in file_query_list]
+         return self.downloaded_file_query_list
+
+     def setup_download_file_query_list(self, file_query_list: List[Tuple[str,dict]]) -> None:
+         self.downloaded_file_query_list = file_query_list
+
+     def init_download_file_query_list(self, ckan: CkanApi, out_dir: str, cancel_if_present:bool=True, **kwargs) -> List[Any]:
+         if out_dir is not None:
+             dir_tables = resolve_rel_path(out_dir, glob_rm_glob(self.dir_name, default_rec_dir=self.name), field=f"File/URL of resource {self.name}")
+             os.makedirs(dir_tables, exist_ok=True)
+         return self.download_file_query_list(ckan=ckan, cancel_if_present=cancel_if_present)
+
+     def get_file_query_len(self) -> int:
+         if self.downloaded_file_query_list is None:
+             raise RuntimeError("You must call download_file_query_list first")
+         return len(self.downloaded_file_query_list)
+
+     def get_file_query_generator(self) -> Generator[Tuple[str,dict], Any, None]:
+         for file_name, file_query in self.downloaded_file_query_list:
+             yield file_name, file_query
+
+     def download_file_query(self, ckan: CkanApi, out_dir: str, file_name:str, file_query:dict) \
+             -> Tuple[Union[str,None], Union[pd.DataFrame,None]]:
+         resource_id = self.get_or_query_resource_id(ckan=ckan, error_not_found=self.download_error_not_found)
+         if resource_id is None and self.download_error_not_found:
+             return None, None
+         self.download_file_query_list(ckan, cancel_if_present=True)
+         file_out = None
+         if out_dir is not None:
+             file_out = resolve_rel_path(out_dir, glob_rm_glob(self.dir_name, default_rec_dir=self.name), file_name, field=f"File/URL of resource {self.name}")
+             if self.download_skip_existing and os.path.exists(file_out):
+                 if ckan.params.verbose_extra:
+                     print(f"Skipping existing file {file_out}")
+                 return file_out, None
+         df_download = self.df_mapper.download_file_query(ckan=ckan, resource_id=resource_id, file_query=file_query)
+         df = self.df_mapper.df_download_alter(df_download, file_query=file_query, fields=self._get_fields_info())
+         if out_dir is not None:
+             self.local_file_format.write_file(df, file_out, fields=self._get_fields_info())
+         else:
+             file_out = None
+         return file_out, df
+
+     def download_file_query_item(self, ckan: CkanApi, out_dir: str, file_query_item: Tuple[str,dict]) -> Tuple[str, pd.DataFrame]:
+         file_name, file_query = file_query_item
+         return self.download_file_query(ckan=ckan, file_name=file_name, file_query=file_query, out_dir=out_dir)
+
+     def download_request(self, ckan: CkanApi, out_dir: str, *, full_download:bool=False, force:bool=False, threads:int=1) -> None:
+         # limit download to first page by default
+         if not full_download:
+             super().download_request(ckan=ckan, out_dir=out_dir, full_download=False, force=force, threads=threads)
+         else:
+             self.download_request_full(ckan=ckan, out_dir=out_dir, threads=threads, force=force)
+
+
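To close, a hedged usage sketch of the concrete BuilderDataStoreFolder added above. The CkanApi construction, resource name, directory layout, and column names are assumptions for illustration only; the builder calls mirror the methods shown in this diff.

```python
# Usage sketch (assumptions marked): upsert a folder of CSV parts into one
# DataStore table, then mirror it back to disk, one file per file_query.
from ckanapi_harvesters.ckan_api import CkanApi
from ckanapi_harvesters.builder.builder_resource_datastore_multi_folder import BuilderDataStoreFolder

ckan = CkanApi()  # assumption: client configuration (URL, API key) is not shown in this diff

builder = BuilderDataStoreFolder(name="sensor-readings",            # hypothetical resource name
                                 format="CSV",
                                 description="Readings split across many CSV files",
                                 dir_name="sensor-readings/*.csv")  # glob resolved under resources_base_dir

# A primary key is strongly recommended so multi-part upserts stay idempotent;
# without one the builder falls back to plain inserts:
builder.setup_default_file_mapper(primary_key=["station_id", "timestamp"])  # hypothetical columns

# Upsert every matching file; the threads argument selects single- vs multi-threaded upload:
builder.upload_request_full(ckan, resources_base_dir="./resources", threads=4)

# Downloads fetch only the first page by default; full_download=True walks every file_query:
builder.download_request(ckan, out_dir="./downloaded", full_download=True)
```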