sibi-dst 0.3.63__py3-none-any.whl → 2025.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. sibi_dst/df_helper/_df_helper.py +186 -591
  2. sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
  3. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
  4. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +291 -97
  5. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
  6. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
  7. sibi_dst/df_helper/core/__init__.py +0 -4
  8. sibi_dst/df_helper/core/_defaults.py +1 -50
  9. sibi_dst/df_helper/core/_query_config.py +2 -2
  10. sibi_dst/utils/__init__.py +0 -2
  11. sibi_dst/utils/data_wrapper.py +9 -12
  12. sibi_dst/utils/log_utils.py +15 -11
  13. sibi_dst/utils/update_planner.py +2 -0
  14. sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
  15. sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
  16. sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
  17. sibi_dst/v3/__init__.py +0 -0
  18. sibi_dst/v3/backends/__init__.py +0 -0
  19. sibi_dst/v3/df_helper/__init__.py +0 -0
  20. sibi_dst/v3/df_helper/_df_helper.py +91 -0
  21. sibi_dst-2025.1.1.dist-info/METADATA +55 -0
  22. {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/RECORD +23 -26
  23. sibi_dst/df_helper/backends/django/__init__.py +0 -11
  24. sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
  25. sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
  26. sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
  27. sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
  28. sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
  29. sibi_dst/utils/airflow_manager.py +0 -212
  30. sibi_dst-0.3.63.dist-info/METADATA +0 -90
  31. {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/WHEEL +0 -0
@@ -1,31 +1,25 @@
1
1
  sibi_dst/__init__.py,sha256=3pbriM7Ym5f9gew7n9cO4G_p9n-0bnxdmQ0hwBdJjr4,253
2
2
  sibi_dst/df_helper/__init__.py,sha256=McYrw2N0MsMgtawLrONXTGdyHfQWVOBUvIDbklfjb54,342
3
3
  sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=-Y4i5KAxKY2BNkmoVeMEZxjTFD7zaM9oQ0aRsvUbQrs,9340
4
- sibi_dst/df_helper/_df_helper.py,sha256=uKP6i-7dasZQ5zViD8-VJU0lNHumrdZG6IXvDFijZ18,31214
4
+ sibi_dst/df_helper/_df_helper.py,sha256=DJRQWTihnEtgBm3X0ar9nH-xcE1PCkWmh1JgID3WDsY,10939
5
5
  sibi_dst/df_helper/_parquet_artifact.py,sha256=Nio5GSD6rTYl52nf_TSpQhYIF0hKqRrB3H3A4zYnaG8,14987
6
6
  sibi_dst/df_helper/_parquet_reader.py,sha256=L6mr2FeKtTeIn37G9EGpvOx8PwMqXb6qnEECqBaiwxo,3954
7
7
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- sibi_dst/df_helper/backends/django/__init__.py,sha256=uWHi-DtQX5re7b2HcqoXUH3_FZWOw1VTmDf552FAkNs,256
9
- sibi_dst/df_helper/backends/django/_db_connection.py,sha256=AGbqCnmiX4toMaFPE5ne5h7QCkImjnBKvzGtUD6Ge8Q,3698
10
- sibi_dst/df_helper/backends/django/_io_dask.py,sha256=NjvJg6y9qKKCRiNrJL4f_A03iKDKEcjCi7LGbr9DgtM,19555
11
- sibi_dst/df_helper/backends/django/_load_from_db.py,sha256=htG9ec4ix371ClEHQVpx4r3mhBdQaSykeHUCCRhN7L4,10637
12
- sibi_dst/df_helper/backends/django/_sql_model_builder.py,sha256=at9J7ecGkZbOOYba85uofe9C-ic4wwOqVgJcHpQNiYQ,21449
13
8
  sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
14
9
  sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
15
10
  sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1uqJzvdaPNTYRb5qXTlQ,182
16
11
  sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
17
12
  sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=TaU5_wG1Y3lQC8DVCItVvMnc6ZJmECLu3avssVEMbaM,10591
18
- sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=TuVp8Ce49dCIIxtyrtFGRblarQUl8QGcS-TDZd515IE,348
19
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=JID-urZLbWjMd2dXt7onp6cPxAWQ3jnsY88s_lCscn8,7980
20
- sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
21
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=BtiRSYA4kFIM-mBCdrwE20vzByfq8_Biv_jPLUCDv58,5466
22
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=ltqB5814PMecxwZgmsJL6nDhQf72V-w71YWFAf7aYZ8,6490
23
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=ksvJ0EvktrVsoJ9DTMIQHzHe8ghw2mzDIBD_YgWytgw,8402
24
- sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
25
- sibi_dst/df_helper/core/_defaults.py,sha256=eNpHD2sZxir-2xO0b3_V16ryw8YP_5FfpIKK0HNuiN4,7011
13
+ sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
14
+ sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=gppZrXLGK8U8xfkzRQPZCIFoWY-miP04nDNHpV8lXtU,10600
15
+ sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=ph4w8Sd9eVr_jUIZuDhGyEwtDn0KQkb0lUkERrIXKGM,12852
16
+ sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=NXVhtYF2mYsrW2fXBkL29VQ5gxAlOYPJkYa8HZKYUyM,2846
17
+ sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=Q93O_xqK0SdrS3IrijVcqky_Zf6xKjtPHdI3qnf1g8E,7457
18
+ sibi_dst/df_helper/core/__init__.py,sha256=LfmTqFh6GUZup-g95bcXgAxX7J5Hkve7ftLE_CJg_AE,409
19
+ sibi_dst/df_helper/core/_defaults.py,sha256=9UMEMu2wXznO5UzEhnQ82f_ZazZ20JRyRXIi3HP3gDw,4043
26
20
  sibi_dst/df_helper/core/_filter_handler.py,sha256=Pmbzygry2mpkNPVS7DBMulHpAb1yYZNFqUU0bJTWJF0,11214
27
21
  sibi_dst/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxRrQKE5FQRxcEWsac,6736
28
- sibi_dst/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
22
+ sibi_dst/df_helper/core/_query_config.py,sha256=1ApqmuSGXTC3CdF-xMsSbCa3V2Z5hOP3Wq5huhzZwqY,439
29
23
  sibi_dst/df_helper/data_cleaner.py,sha256=lkxQoXLvGzXCicFUimnA5nen5qkrO1oxgl_p2Be2o8w,5183
30
24
  sibi_dst/geopy_helper/__init__.py,sha256=Q1RJiUZIOlV0QNNLjxZ_2IZS5LqIe5jRbeQkfD1Vm60,112
31
25
  sibi_dst/geopy_helper/geo_location_service.py,sha256=1ArI980QF_gRw096ZsABHwJt-m55jrfOlB8tPwL1BvY,2959
@@ -38,36 +32,35 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
38
32
  sibi_dst/osmnx_helper/utils.py,sha256=BzuY8CtYnBAAO8UAr_M7EOk6CP1zcifNLs8pkdFZEFg,20577
39
33
  sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
34
  sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
41
- sibi_dst/utils/__init__.py,sha256=w0_q4rl3yD7x1Q5yWxH-GN_3Ju1XlebIzm3nJdrUeGE,1234
42
- sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWdQv10,7526
35
+ sibi_dst/utils/__init__.py,sha256=H0Yr_Xo4dBTf03_Si_cggmPNSv6cf8_BBetoHJ86Tiw,1162
43
36
  sibi_dst/utils/clickhouse_writer.py,sha256=iAUe4_Kn2WR1xZjpLW2FOWCWfOTw6fCGMTUcWxIQJ60,9877
44
37
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
45
38
  sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
46
39
  sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
47
- sibi_dst/utils/data_wrapper.py,sha256=Tb9bHIHI6qVsdH791BOFN1VrPb-7GS4fHhhHV8hktec,9641
40
+ sibi_dst/utils/data_wrapper.py,sha256=69aPQFP178-QTJ_joJYqymP--wNxa1qzri_KkvvUTIw,9688
48
41
  sibi_dst/utils/date_utils.py,sha256=T3ij-WOQu3cIfmNAweSVMWWr-hVtuBcTGjEY-cMJIvU,18627
49
42
  sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
50
43
  sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
51
44
  sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
52
- sibi_dst/utils/log_utils.py,sha256=77xACRagKU83H9vn7aVeBzkQjxWlbe4dg4KuxPRCgvw,4635
45
+ sibi_dst/utils/log_utils.py,sha256=C2wkbxGC2n6hZIEU-z8rHrunDcq95MHkf3B1zGynHnE,4904
53
46
  sibi_dst/utils/manifest_manager.py,sha256=eyk6Dvrn86gUpAaAsnQvNnEJn5-Tno-sDDJsDMfHtTA,18161
54
47
  sibi_dst/utils/parquet_saver.py,sha256=O62xwPfphOpKgEiHqnts20CPSU96pxs49Cg7PVetLK0,8193
55
48
  sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
56
49
  sibi_dst/utils/storage_config.py,sha256=TE15H-7d0mqwYPSUgrdidK9U7N7p87Z8JfUQH4-jdPs,4123
57
50
  sibi_dst/utils/storage_manager.py,sha256=btecX7ggNb7rfu5EK9Xuu2q_FZA7r_rB_tfhQ8V96qc,6567
58
- sibi_dst/utils/update_planner.py,sha256=dJXLC-KdbWrCs-MFe7Xa8F-ZhlNJq8P1szjLAzMJZk0,9684
51
+ sibi_dst/utils/update_planner.py,sha256=t9A5DLE9cDiYNO8ctQIWVyVWnkMSV0PfbBJ43A0bQv4,9742
59
52
  sibi_dst/utils/webdav_client.py,sha256=pYF1UsGOuxYeGLq7aBfwZFvkvD4meOcbbaiZ4d6GW9I,7107
60
53
  sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
54
  sibi_dst/v2/df_helper/__init__.py,sha256=XuH6jKYAPg2DdRbsxxBSxp9X3x-ARyaT0xe27uILrVo,99
62
55
  sibi_dst/v2/df_helper/_df_helper.py,sha256=9pED3bjQ2Z81zqzJrZ9e7SguoO4-hBmNTJK4WOKrr4M,9297
63
56
  sibi_dst/v2/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
57
  sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py,sha256=MOEedyWqcb1_RiRYKyyWX0uFNCfBgmyYbTjco8-GBxU,262
65
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py,sha256=8u3jdD0sR2fmm2H75GDdygoqiqDI6-N-azOJsLgUWFA,3189
66
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py,sha256=6V9DruwckEsonYW5YvBY93-NzXYHbTA7OsyMKMYIZEs,5472
58
+ sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py,sha256=1xeMxPBpChXDMpmQmTYtta6y43ndU6x6szdon8KIj9g,15443
59
+ sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py,sha256=LmUBK-HRrF-RncnZ2DsZSqxmTrAeQEyD8oqaBDfr518,5449
67
60
  sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=jhgN0OO5Sk1zQFHrMUhJn2F_hHB5g3x3EJ8j5PXNb0U,6295
68
61
  sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py,sha256=jX_mQAzl_6xdh7CTYw4uvUIX2wMp3NzXMlfbC5alOzs,13632
69
62
  sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py,sha256=LcwJjVVxxrnVZalWqnz5m7r77i9tmJR0-U2k8eSQ-m8,249
70
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py,sha256=b5xmxQr4a8fhE4qdCGJrNWjjX1NW5hrPNLmlfP20rIg,2897
63
+ sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py,sha256=n3CDbda0OY3X7eTeu_PR2KcZ5hYyEJL7Hroo8yQkjG8,15435
71
64
  sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py,sha256=wVgNPo5V75aLtlZr_SIQ-yteyXq-Rg93eMfR8JCfkSo,5422
72
65
  sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py,sha256=FIs6UrNxdJ7eDHDvTv-cJuybIue2-oCRedhW-MNe7CU,6285
73
66
  sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py,sha256=k0dnMLkLMMvkDYDYWkGFgibW5UD8pJgB3YrEg_R7pj8,13556
@@ -77,6 +70,10 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
77
70
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
78
71
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
79
72
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
80
- sibi_dst-0.3.63.dist-info/METADATA,sha256=ZsVn8AeFIUeVrb0Ybxmjk393FdUyn2j2fOnGQ8MXM1k,4292
81
- sibi_dst-0.3.63.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
82
- sibi_dst-0.3.63.dist-info/RECORD,,
73
+ sibi_dst/v3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
+ sibi_dst/v3/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
75
+ sibi_dst/v3/df_helper/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
+ sibi_dst/v3/df_helper/_df_helper.py,sha256=NKIQ4Y-Tn-e841sbZxzLh3Q071_Zo9Vu4y3OAXcsO98,3900
77
+ sibi_dst-2025.1.1.dist-info/METADATA,sha256=OBt3aCLjPtRPN-YxaKLvNL13_H5sRqjEo-NpDMK-nD0,2366
78
+ sibi_dst-2025.1.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
79
+ sibi_dst-2025.1.1.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from ._io_dask import ReadFrameDask
4
- from ._db_connection import DjangoConnectionConfig
5
- from ._load_from_db import DjangoLoadFromDb
6
-
7
- __all__ = [
8
- "DjangoConnectionConfig",
9
- "ReadFrameDask",
10
- "DjangoLoadFromDb"
11
- ]
@@ -1,88 +0,0 @@
1
- from typing import Any
2
-
3
- from pydantic import BaseModel, model_validator
4
-
5
- from ._sql_model_builder import DjangoSqlModelBuilder
6
-
7
-
8
class DjangoConnectionConfig(BaseModel):
    """
    Configuration for a Django database connection and the model used to query it.

    A model can be supplied directly, or it is built from ``table`` with
    ``DjangoSqlModelBuilder`` when no model is given. After the model is
    resolved, a trivial query is issued to confirm the connection works.

    :ivar live: ``True`` when a model was supplied by the caller; ``False`` when
        the model had to be built from ``table``.
    :type live: bool
    :ivar connection_name: Name of the Django database connection. Mandatory.
    :type connection_name: str
    :ivar table: Database table name. Required when ``model`` is not supplied.
    :type table: str
    :ivar model: Django model for the table; built dynamically when omitted.
    :type model: Any
    """

    live: bool = False
    # The three fields below default to None so that a missing value is
    # reported by check_model() with a specific message.
    connection_name: str = None
    table: str = None
    model: Any = None

    @model_validator(mode="after")
    def check_model(self):
        """
        Validate the configuration, resolve the model and test the connection.

        :raises ValueError: If ``connection_name`` is missing.
        :raises ValueError: If neither ``model`` nor ``table`` is provided.
        :raises ValueError: If building the model from ``table`` fails.
        :raises ValueError: If the connection test query fails.
        :return: The validated (and possibly updated) instance.
        """
        if self.connection_name is None:
            raise ValueError("Connection name must be specified")

        # A table name implies a dynamically built model unless a model is
        # also supplied, in which case ``live`` is switched back on below.
        if self.table:
            self.live = False

        if not self.model:
            if not self.table:
                raise ValueError("Table name must be specified to build the model")
            try:
                self.model = DjangoSqlModelBuilder(
                    connection_name=self.connection_name, table=self.table
                ).build_model()
            except Exception as e:
                # Chain the cause so the builder's traceback is not lost.
                raise ValueError(f"Failed to build model: {e}") from e
        else:
            self.live = True

        # Only test the connection once a model is available to query with.
        self.validate_connection()
        return self

    def validate_connection(self):
        """
        Run a simple ``exists()`` query against ``connection_name``.

        :raises ValueError: If the query raises any exception; the original
            exception is attached as the cause.
        """
        try:
            self.model.objects.using(self.connection_name).exists()
        except Exception as e:
            raise ValueError(
                f"Failed to connect to the database '{self.connection_name}': {e}"
            ) from e
@@ -1,450 +0,0 @@
1
- import itertools
2
-
3
- import dask.dataframe as dd
4
- import django
5
- import pandas as pd
6
- from django.core.cache import cache
7
- from django.core.exceptions import FieldDoesNotExist
8
- from django.db import models
9
- from django.db.models import Field
10
- from django.utils.encoding import force_str as force_text
11
-
12
-
13
class ReadFrameDask:
    """
    Convert a Django ORM QuerySet into a Dask DataFrame.

    The QuerySet is iterated in chunks; each chunk becomes a single-partition
    Dask DataFrame and the partitions are concatenated. Column dtypes are
    derived from the Django field types, and in verbose mode columns backed by
    ``choices`` or ``ForeignKey`` fields are replaced with readable text.

    :ivar qs: The Django QuerySet to convert.
    :type qs: django.db.models.query.QuerySet
    :ivar coerce_float: Passed to ``pandas.DataFrame.from_records``.
    :type coerce_float: bool
    :ivar chunk_size: Number of records fetched and converted per chunk.
    :type chunk_size: int
    :ivar verbose: If True, apply the verbose replacements described above.
    :type verbose: bool
    """

    # Kept as a class attribute because ``to_fields`` catches
    # ``cls.FieldDoesNotExist``. The former ``django.VERSION`` switch selected
    # the same class on both branches, so it is collapsed here.
    FieldDoesNotExist = django.core.exceptions.FieldDoesNotExist

    def __init__(
            self,
            qs,
            **kwargs,
    ):
        """
        Store the QuerySet and the conversion options.

        :param qs: The QuerySet to convert.
        :param kwargs: Optional settings:
            - coerce_float: forwarded to pandas; default False.
            - chunk_size: records per chunk; default 1000.
            - verbose: apply verbose replacements; default True.
        """
        self.qs = qs
        self.coerce_float = kwargs.get("coerce_float", False)
        self.chunk_size = kwargs.get("chunk_size", 1000)
        self.verbose = kwargs.get("verbose", True)

    @staticmethod
    def replace_from_choices(choices):
        """
        Build a function that maps values through a ``choices`` dictionary.

        :param choices: Mapping of stored value to replacement value.
        :return: A function taking an iterable of values and returning a list
            in which every value found in ``choices`` is replaced and every
            other value is left unchanged.
        """
        def inner(values):
            return [choices.get(v, v) for v in values]

        return inner

    @staticmethod
    def get_model_name(model):
        """
        Return ``model._meta.model_name``.

        :param model: A Django model class or instance.
        :return: The model name.
        :rtype: str
        """
        return model._meta.model_name

    @staticmethod
    def get_related_model(field):
        """
        Return the model a relational field points to.

        :param field: A field object that may expose ``related_model`` or the
            legacy ``rel`` attribute.
        :return: The related model, or None when neither attribute is set.
        """
        model = None
        if hasattr(field, "related_model") and field.related_model:
            model = field.related_model
        elif hasattr(field, "rel") and field.rel:
            model = field.rel.to
        return model

    @classmethod
    def get_base_cache_key(cls, model):
        """
        Build the cache-key template used for rendered related objects.

        :param model: A Django model.
        :return: A template containing one ``%s`` placeholder for the primary
            key, namespaced by app label and model name.
        :rtype: str
        """
        return (
            f"dask_{model._meta.app_label}_{cls.get_model_name(model)}_%s_rendering"
        )

    @classmethod
    def replace_pk(cls, model):
        """
        Build a function that replaces primary keys with the text of the
        related object.

        The returned function first looks every key up in the Django cache.
        When at least one key is missing it loads all referenced objects from
        the database, renders them with ``force_str`` and stores the result in
        the cache.

        :param model: The Django model the primary keys refer to.
        :return: A function taking a pandas Series of primary keys. It returns
            the (object-typed) Series unchanged when it holds no keys,
            otherwise a list with one rendered value (or None) per row.
        :rtype: callable
        """
        base_cache_key = cls.get_base_cache_key(model)

        def is_missing(pk):
            # None, pandas NA and float NaN all mean "no key".
            return pk is None or pk is pd.NA or (isinstance(pk, float) and pk != pk)

        def get_cache_key_from_pk(pk):
            return None if is_missing(pk) else base_cache_key % str(pk)

        def inner(pk_series):
            pk_series = pk_series.astype(object).where(pk_series.notnull(), None)
            # A plain list avoids Series.apply(convert_dtype=...), which is
            # deprecated in recent pandas releases.
            cache_keys = [get_cache_key_from_pk(pk) for pk in pk_series]
            unique_cache_keys = [
                key for key in dict.fromkeys(cache_keys) if key is not None
            ]
            if not unique_cache_keys:
                return pk_series

            out_dict = cache.get_many(unique_cache_keys)
            if len(out_dict) < len(unique_cache_keys):
                # Filter on "missing", not truthiness, so keys such as 0 or ""
                # are still looked up.
                wanted_pks = [pk for pk in pk_series.unique() if not is_missing(pk)]
                out_dict = {
                    base_cache_key % obj.pk: force_text(obj)
                    for obj in model.objects.filter(pk__in=wanted_pks)
                }
                cache.set_many(out_dict)
            return [out_dict.get(key) for key in cache_keys]

        return inner

    @classmethod
    def build_update_functions(cls, fieldnames, fields):
        """
        Yield the verbose-replacement function for each field that needs one.

        :param fieldnames: Field names, paired positionally with ``fields``.
        :type fieldnames: list[str]
        :param fields: Field objects (or placeholders) for those names.
        :type fields: list[Field]
        :return: Generator of ``(fieldname, function)`` tuples. ``function`` is
            None for entries that are not Django ``Field`` instances. Fields
            that have neither choices nor a ``ForeignKey`` type yield nothing.
        """
        for fieldname, field in zip(fieldnames, fields):
            if not isinstance(field, Field):
                yield fieldname, None
            elif field.choices:
                choices = {k: force_text(v) for k, v in field.flatchoices}
                yield fieldname, cls.replace_from_choices(choices)
            elif field.get_internal_type() == "ForeignKey":
                yield fieldname, cls.replace_pk(cls.get_related_model(field))

    @staticmethod
    def _apply_update(partition, function):
        """
        Apply ``function`` to one partition and return an object-typed Series.

        The update functions may return a plain list; wrapping the result keeps
        the partition's index and name.
        """
        return pd.Series(
            function(partition),
            index=partition.index,
            name=partition.name,
            dtype=object,
        )

    @classmethod
    def update_with_verbose(cls, df, fieldnames, fields):
        """
        Replace columns of ``df`` in place using the verbose update functions.

        :param df: The Dask DataFrame to update.
        :param fieldnames: Column names to consider.
        :param fields: Field objects paired positionally with ``fieldnames``.
        :return: The same ``df`` object, after the column assignments.
        """
        for fieldname, function in cls.build_update_functions(fieldnames, fields):
            if function is None:
                continue
            # ``function`` is passed as an argument instead of being captured
            # by a lambda: the graph is evaluated lazily, so a closure over the
            # loop variable would make every column use the last function.
            # ``meta`` is given explicitly so that building the graph does not
            # call the function on placeholder data.
            df[fieldname] = df[fieldname].map_partitions(
                cls._apply_update, function, meta=(fieldname, "object")
            )
        return df

    @classmethod
    def to_fields(cls, qs, fieldnames):
        """
        Resolve field names of a QuerySet's model into field objects.

        :param qs: The QuerySet whose model is inspected.
        :type qs: QuerySet
        :param fieldnames: Field names; ``__`` separates relation steps.
        :type fieldnames: List[str]
        :return: Generator yielding one item per field name: the resolved field
            object, or the field name itself when it cannot be resolved.
        """
        for fieldname in fieldnames:
            model = qs.model
            # Fallback so an unresolved name never yields a stale field from a
            # previous iteration or raises UnboundLocalError.
            field = fieldname
            for fieldname_part in fieldname.split("__"):
                try:
                    field = model._meta.get_field(fieldname_part)
                except cls.FieldDoesNotExist:
                    try:
                        rels = model._meta.get_all_related_objects_with_model()
                    except AttributeError:
                        field = fieldname
                    else:
                        for relobj, _ in rels:
                            if relobj.get_accessor_name() == fieldname_part:
                                field = relobj.field
                                model = field.model
                                break
                else:
                    model = cls.get_related_model(field)
            yield field

    @staticmethod
    def is_values_queryset(qs):
        """
        Tell whether ``qs`` iterates with Django's ``ValuesIterable``.

        :param qs: The QuerySet to check.
        :type qs: django.db.models.query.QuerySet
        :return: True when ``qs._iterable_class`` is ``ValuesIterable``; False
            otherwise, including when the check itself fails.
        :rtype: bool
        """
        try:
            return qs._iterable_class == django.db.models.query.ValuesIterable
        except Exception:
            # Best-effort check; unlike a bare ``except`` this does not swallow
            # KeyboardInterrupt or SystemExit.
            return False

    @staticmethod
    def object_to_dict(obj, fields=None):
        """
        Return the attributes of ``obj`` as a new dictionary.

        :param obj: Object exposing ``__dict__``; may be None.
        :param fields: Attribute names to include. When empty or None, every
            attribute except ``_state`` is included.
        :return: A new dictionary; empty when ``obj`` is None. Requested names
            missing from ``obj.__dict__`` map to None.
        :rtype: dict
        """
        if obj is None:
            return {}
        attributes = obj.__dict__
        if not fields:
            # Build a copy instead of popping "_state" from the live instance,
            # which would leave the object without its state attribute.
            return {key: value for key, value in attributes.items() if key != "_state"}
        return {field: attributes.get(field) for field in fields if field is not None}

    @staticmethod
    def infer_dtypes_from_django(qs):
        """
        Map the QuerySet's model fields and annotations to DataFrame dtypes.

        Non-concrete fields (such as reverse relations) are skipped. Field
        types without an entry in the mapping, and annotations without an
        ``output_field``, default to ``object``.

        :param qs: The QuerySet whose model and annotations are inspected.
        :type qs: QuerySet
        :return: Mapping of field or annotation name to dtype string.
        :rtype: dict
        """
        django_to_dask_dtype = {
            'AutoField': 'Int64',  # nullable integer
            'BigAutoField': 'Int64',
            'BigIntegerField': 'Int64',
            'BooleanField': 'bool',
            'CharField': 'object',
            'DateField': 'datetime64[ns]',
            'DateTimeField': 'datetime64[ns]',
            'DecimalField': 'float64',
            'FloatField': 'float64',
            'IntegerField': 'Int64',  # nullable integer
            'PositiveIntegerField': 'Int64',
            'SmallIntegerField': 'Int64',
            'TextField': 'object',
            'TimeField': 'object',
            'UUIDField': 'object',
            'ForeignKey': 'Int64',  # nullable integer for FK columns
        }

        dtypes = {}
        for field in qs.model._meta.get_fields():
            if not getattr(field, 'concrete', False):
                continue
            if isinstance(field, (models.AutoField, models.BigAutoField)):
                dtypes[field.name] = 'Int64'
            else:
                field_type = field.get_internal_type()
                dtypes[field.name] = django_to_dask_dtype.get(field_type, 'object')

        for annotation_name, annotation in qs.query.annotation_select.items():
            if hasattr(annotation, 'output_field'):
                field_type = annotation.output_field.get_internal_type()
                dtype = django_to_dask_dtype.get(field_type, 'object')
            else:
                dtype = 'object'
            dtypes[annotation_name] = dtype

        return dtypes

    def read_frame(self, fillna_value=None):
        """
        Read the QuerySet in chunks and return it as a Dask DataFrame.

        Each chunk is converted to a pandas DataFrame, optionally NaN-filled,
        made timezone-naive, cast to the inferred dtypes and wrapped as a
        single-partition Dask DataFrame. An empty QuerySet yields an empty
        DataFrame with the expected columns instead of failing on
        concatenation.

        :param fillna_value: Value used to fill NaNs before casting. When
            None, NaNs are left in place.
        :type fillna_value: Any
        :return: The concatenated Dask DataFrame.
        :rtype: dask.dataframe.DataFrame
        """
        qs = self.qs
        coerce_float = self.coerce_float
        verbose = self.verbose
        chunk_size = self.chunk_size

        fields = qs.model._meta.fields
        fieldnames = [f.name for f in fields]
        fieldnames += list(qs.query.annotation_select.keys())
        fieldnames = tuple(fieldnames)

        dtypes = self.infer_dtypes_from_django(qs)
        if fieldnames:
            dtypes = {field: dtype for field, dtype in dtypes.items() if field in fieldnames}

        partitions = []
        iterator = iter(qs.iterator(chunk_size=chunk_size))

        while True:
            chunk = list(itertools.islice(iterator, chunk_size))
            if not chunk:
                break

            # NOTE(review): values are read from each instance's __dict__ by
            # field *name*; relation fields appear to be stored under their
            # "<name>_id" attribute, so such columns may come out empty.
            # Confirm against the callers before changing.
            df = pd.DataFrame.from_records(
                [self.object_to_dict(obj, fieldnames) for obj in chunk],
                columns=fieldnames,
                coerce_float=coerce_float,
            )
            if fillna_value is not None:
                df = df.fillna(fillna_value)

            # Strip timezone information so the datetime64[ns] cast applies.
            for col in df.columns:
                if isinstance(df[col].dtype, pd.DatetimeTZDtype):
                    df[col] = df[col].dt.tz_localize(None)

            df = df.astype(dtypes)
            partitions.append(dd.from_pandas(df, npartitions=1))

        if not partitions:
            # Nothing was read: fall back to one empty, correctly typed
            # partition so concatenation has something to work with.
            empty = pd.DataFrame(columns=list(fieldnames)).astype(dtypes)
            partitions.append(dd.from_pandas(empty, npartitions=1))

        dask_df = dd.concat(partitions, axis=0, ignore_index=True)

        if verbose:
            self.update_with_verbose(dask_df, fieldnames, fields)

        return dask_df