recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333):
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,386 @@
1
+ import collections
2
+ import contextlib
3
+ import csv
4
+ import datetime
5
+ import fcntl
6
+ import glob
7
+ import hashlib
8
+ import json
9
+ import logging
10
+ import os
11
+ import shutil
12
+ import tempfile
13
+ from pathlib import Path
14
+ from typing import IO, Any, Sequence, Union
15
+
16
+ from recurvedata.utils import helpers, shell
17
+
18
+ logger = logging.getLogger(__name__)
19
+ PathLike = Union[str, os.PathLike]
20
+
21
+ _csv_dialect_options = {
22
+ "delimiter": ",",
23
+ "quoting": csv.QUOTE_ALL,
24
+ "lineterminator": "\r\n",
25
+ }
26
+
27
+
28
def new_tempfile(suffix: str = "", prefix: str = None, dir: str = None) -> str:
    """Create an empty temporary file with a random filename.

    The generated name ends with a ``YYYYmmddHHMMSS`` timestamp, an underscore
    and then ``suffix``.

    Args:
        suffix: suffix of the filename
        prefix: prefix of the filename; the ``tempfile`` default is used when empty
        dir: directory to store the file; the ``tempfile`` default is used when None

    Returns:
        the filename
    """
    ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    kwargs = {"suffix": f"{ts}_{suffix}", "dir": dir}
    if prefix:
        kwargs["prefix"] = prefix
    fd, filename = tempfile.mkstemp(**kwargs)
    # mkstemp hands back an open OS-level descriptor; only the path is returned
    # to the caller, so the descriptor must be closed here or it leaks.
    os.close(fd)
    return filename
45
+
46
+
47
def new_tempdir(suffix: str = "", prefix: str = None, dir: str = None) -> str:
    """Create a temporary directory with a random name.

    The generated name ends with a ``YYYYmmddHHMMSS`` timestamp, an underscore
    and then ``suffix``.

    Args:
        suffix: suffix of the directory name
        prefix: prefix of the directory name; omitted when empty
        dir: parent directory in which to create the new directory

    Returns:
        the path of the created directory
    """
    stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    options = dict(suffix=f"{stamp}_{suffix}", dir=dir)
    if prefix:
        options.update(prefix=prefix)
    return tempfile.mkdtemp(**options)
63
+
64
+
65
def merge_files(
    files: Sequence[PathLike],
    filename: str = None,
    num_skip_lines: int = 0,
    delete: bool = True,
) -> str:
    """Concat multiple files into one.

    Args:
        files: source file names
        filename: target filename, will create a tempfile if not provided
        num_skip_lines: skip n lines of every source file before merging it
        delete: delete source files after being merged

    Returns:
        the target filename
    """
    import shlex  # local import: only needed to quote the `cat` command line

    if filename is None:
        fd, filename = tempfile.mkstemp()
        # Only the path is needed; close the descriptor so it does not leak.
        os.close(fd)

    if num_skip_lines:
        with open(filename, "wb") as fout:
            for f in files:
                with open(f, "rb") as fin:
                    for _ in range(num_skip_lines):
                        fin.readline()
                    shutil.copyfileobj(fin, fout)
    elif not files:
        # `cat` without operands would read stdin; just produce an empty target.
        open(filename, "wb").close()
    elif len(files) == 1 and delete:
        # shutil.move also works when source and target are on different filesystems.
        shutil.move(os.fspath(files[0]), filename)
    else:
        # merge by `cat` for better performance; quote every path so that
        # whitespace or shell metacharacters in names cannot alter the command
        sources = " ".join(shlex.quote(os.fspath(f)) for f in files)
        shell.run(f"cat {sources} > {shlex.quote(os.fspath(filename))}", logger)

    if delete:
        remove_files_safely(files)

    return filename
104
+
105
+
106
def remove_lines_from_start(filename: PathLike, lines: int, inplace: bool = False) -> str:
    """Drop the first n lines of a file.

    The remainder is written to a fresh temporary file, which then either
    replaces the source (``inplace=True``) or is returned as a separate file.

    Args:
        filename: source file name
        lines: number of leading lines to drop
        inplace: modify the file in-place or not

    Returns:
        the filename, will be the same as the source file if inplace is True
    """
    stripped = new_tempfile()
    with open(filename, "rb") as src:
        with open(stripped, "wb") as dst:
            # consume the leading lines; running out of lines early is harmless
            for _ in range(lines):
                next(src, None)
            # everything that is left goes to the new file unchanged
            shutil.copyfileobj(src, dst)

    return replace_file_with_temp(stripped, filename, inplace)
127
+
128
+
129
def is_file_empty(filename: PathLike) -> bool:
    """Return True when the file has zero size; a missing file counts as empty."""
    try:
        size = os.stat(filename).st_size
    except FileNotFoundError:
        return True
    return size == 0
135
+
136
+
137
def remove_files(files: Sequence[PathLike]) -> None:
    """Remove files.

    A single path is accepted as well: ``helpers.ensure_list`` wraps it in a list.
    Errors raised by ``os.unlink`` propagate to the caller.
    """
    for path in helpers.ensure_list(files):
        os.unlink(path)
142
+
143
+
144
def remove_files_safely(files: Sequence[PathLike]) -> None:
    """Remove files safely. Ignore the errors.

    Errors are suppressed per file, so one missing or invalid entry does not
    prevent the remaining files from being removed.

    Args:
        files: a single path or a list/tuple/set of paths
    """
    for f in helpers.ensure_list(files):
        with contextlib.suppress(OSError, TypeError, ValueError):
            os.unlink(f)
148
+
149
+
150
def remove_files_by_pattern(pattern: str) -> None:
    """Remove every file matching a glob pattern. Ignore the errors.

    The matched paths are logged at INFO level before removal.
    """
    matched = glob.glob(pattern)
    logger.info("files to be deleted: %s", str(matched))
    remove_files_safely(matched)
155
+
156
+
157
def remove_folder_safely(folder: str) -> None:
    """Remove a folder tree if it exists. Ignore the errors."""
    if os.path.exists(folder):
        shutil.rmtree(folder, ignore_errors=True)
162
+
163
+
164
@contextlib.contextmanager
def ensure_remove(filename: PathLike):
    """Yield ``filename`` and remove it (best effort) when the block exits.

    The removal also runs when the managed block raises.
    """
    with contextlib.ExitStack() as cleanup:
        # registered callbacks run on exit, whether or not an exception occurred
        cleanup.callback(remove_files_safely, filename)
        yield filename
171
+
172
+
173
def convert_excel_to_csv(
    src_file: PathLike,
    dst_file: PathLike = None,
    skiprows: int = 0,
    inplace: bool = True,
) -> str:
    """Convert an Excel file to a CSV file

    The CSV is written without header and index, using ``\\r\\n`` line endings.

    Args:
        src_file: the path of the Excel file
        dst_file: the path of output file, a temporary filename will be made otherwise
        skiprows: skip the first N rows
        inplace: replace the original file if True

    Returns:
        the target_filename
    """
    import pandas as pd

    if not dst_file:
        dst_file = new_tempfile(dir=os.path.dirname(src_file))

    df = pd.read_excel(src_file, skiprows=skiprows)
    csv_options = {"header": False, "index": False}
    try:
        # `lineterminator` is the spelling since pandas 1.5; the old
        # `line_terminator` keyword was removed in pandas 2.0.
        df.to_csv(dst_file, lineterminator="\r\n", **csv_options)
    except TypeError:
        # older pandas releases only know the previous keyword name
        df.to_csv(dst_file, line_terminator="\r\n", **csv_options)

    return replace_file_with_temp(dst_file, src_file, inplace)
199
+
200
+
201
def convert_jsonlines_to_csv(
    src_file: PathLike,
    dst_file: PathLike = None,
    skiprows: int = 0,
    src_encoding: str = "utf8",
    inplace: bool = True,
) -> str:
    """Convert a JSON Lines file to a CSV file

    The CSV columns are the keys of the first JSON object, in their original
    order. No header row is written. Blank lines are skipped, and an input
    without any record produces an empty CSV file.

    Args:
        src_file: the path of the JSON Lines file
        dst_file: the path of output file, a temporary filename will be made otherwise
        skiprows: skip the first N rows
        src_encoding: the encoding of the JSON Lines file
        inplace: replace the original file if True

    Returns:
        the target_filename
    """
    if not dst_file:
        dst_file = new_tempfile(dir=os.path.dirname(src_file))

    decoder = json.JSONDecoder(object_pairs_hook=collections.OrderedDict)
    # newline="" lets the csv writer control line endings itself, as the csv
    # module documentation requires for files passed to a writer.
    with open(src_file, "r", encoding=src_encoding) as f_in, open(dst_file, "w", newline="") as f_out:
        _skip_header_rows(f_in, skiprows)

        writer = None
        for line in f_in:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            row = decoder.decode(line)
            if writer is None:
                # the first record defines the column order
                writer = csv.DictWriter(f_out, fieldnames=list(row.keys()), **_csv_dialect_options)
            writer.writerow(row)

    return replace_file_with_temp(dst_file, src_file, inplace)
236
+
237
+
238
def convert_encoding(
    filename: PathLike,
    src_encoding: str,
    dst_encoding: str = "utf8",
    skiprows: int = 0,
    inplace: bool = True,
) -> str:
    """Re-encode a text file.

    When source and destination encodings are equal the file is returned
    untouched (``skiprows`` is not applied in that case).

    Args:
        filename: the path of the file
        src_encoding: the encoding of the file
        dst_encoding: the encoding to convert to
        skiprows: skip the first N rows
        inplace: replace the original file if True

    Returns:
        the target_filename
    """
    if dst_encoding == src_encoding:
        return filename

    # write next to the source file
    converted = new_tempfile(dir=os.path.dirname(filename))
    with open(filename, "r", encoding=src_encoding) as reader:
        with open(converted, "w", encoding=dst_encoding) as writer:
            _skip_header_rows(reader, skiprows)
            shutil.copyfileobj(reader, writer)

    return replace_file_with_temp(converted, filename, inplace)
266
+
267
+
268
def convert_csv_dialect(
    filename: PathLike,
    src_dialect_options: dict[str, Any],
    dst_dialect_options: dict[str, Any] = None,
    skiprows: int = 0,
    src_encoding: str = "utf8",
    inplace: bool = True,
) -> str:
    """Convert the dialect of a CSV file

    NOTE: when source and target dialects are already equal, the file is only
    re-encoded to utf8 (in place, if ``src_encoding`` is not "utf8") and the
    original filename is returned; ``skiprows`` is applied only as part of that
    re-encoding and ``inplace`` is not consulted.

    Args:
        filename: the path of the CSV file
        src_dialect_options: the dialect of the file
        dst_dialect_options: the dialect to convert to, defaults to the module's
            standard dialect options
        skiprows: skip the first N rows
        src_encoding: the encoding of the file
        inplace: replace the original file if True

    Returns:
        the target_filename
    """
    if dst_dialect_options is None:
        dst_dialect_options = _csv_dialect_options.copy()

    if _same_dict(src_dialect_options, dst_dialect_options):
        if src_encoding != "utf8":
            convert_encoding(filename, src_encoding=src_encoding, skiprows=skiprows, inplace=True)
        return filename

    dst_file = new_tempfile(dir=os.path.dirname(filename))
    # newline="" on both sides: the csv module must see and emit raw line
    # endings, otherwise newlines embedded in quoted fields get translated.
    with open(filename, "r", encoding=src_encoding, newline="") as f_in, open(dst_file, "w", newline="") as f_out:
        _skip_header_rows(f_in, skiprows)

        reader = csv.reader(f_in, **src_dialect_options)
        writer = csv.writer(f_out, **dst_dialect_options)
        for row in reader:
            writer.writerow(row)

    return replace_file_with_temp(dst_file, filename, inplace)
307
+
308
+
309
def replace_file_with_temp(tmp_file: PathLike, target_file: PathLike, inplace: bool = False) -> PathLike:
    """Determine the filename of the converted file, and move it over the target if inplace is True

    Args:
        tmp_file: the freshly written (converted) file
        target_file: the original file that may be replaced
        inplace: when True, ``tmp_file`` is moved onto ``target_file``

    Returns:
        ``target_file`` if inplace is True, otherwise ``tmp_file``
    """
    if inplace:
        # shutil.move renames when possible and falls back to copy + delete,
        # so it also works when the temp file lives on another filesystem
        # (plain os.rename fails with EXDEV there).
        shutil.move(os.fspath(tmp_file), os.fspath(target_file))
        return target_file
    return tmp_file
315
+
316
+
317
+ def _skip_header_rows(f: IO, n: int = 0):
318
+ for _ in range(n):
319
+ f.readline()
320
+
321
+
322
+ def _same_dict(a: dict, b: dict) -> bool:
323
+ if len(a) != len(b):
324
+ return False
325
+ for k in a:
326
+ if k not in b or a[k] != b[k]:
327
+ return False
328
+ return True
329
+
330
+
331
+ def calculate_md5(filepath: Path | str) -> str:
332
+ md5_hash = hashlib.md5()
333
+ chunk_size = 1024 * 1024
334
+ with open(filepath, "rb") as f:
335
+ for chunk in iter(lambda: f.read(chunk_size), b""):
336
+ md5_hash.update(chunk)
337
+
338
+ return md5_hash.hexdigest()
339
+
340
+
341
+ class FileLock:
342
+ """A file lock using fcntl.
343
+ copy from recurve web
344
+ """
345
+
346
+ def __init__(self, lock_file_path: str | Path):
347
+ self.lock_file_path = Path(lock_file_path)
348
+ self.fd = None
349
+
350
+ def acquire(self):
351
+ try:
352
+ self.fd = self.lock_file_path.open("w")
353
+ # Acquire an exclusive lock, this will block until the lock is acquired
354
+ fcntl.flock(self.fd, fcntl.LOCK_EX)
355
+ except Exception as e:
356
+ self._reset()
357
+ raise e # Propagate unexpected exceptions
358
+
359
+ def release(self):
360
+ if not self.fd:
361
+ return
362
+ try:
363
+ fcntl.flock(self.fd, fcntl.LOCK_UN)
364
+ except Exception as e:
365
+ raise e # Propagate unexpected exceptions
366
+ finally:
367
+ self._reset()
368
+
369
+ def _reset(self):
370
+ if self.fd:
371
+ self.fd.close()
372
+ self.fd = None
373
+
374
+ def __enter__(self):
375
+ self.acquire()
376
+ return self
377
+
378
+ def __exit__(self, exc_type, exc_val, exc_tb):
379
+ self.release()
380
+
381
+ def __del__(self):
382
+ try:
383
+ self.release()
384
+ except Exception:
385
+ # Suppress exceptions in __del__, as we've done our best
386
+ pass
@@ -0,0 +1,170 @@
1
+ import hashlib
2
+ import os
3
+ from typing import Callable, Generator, Iterable, TypeVar, Union, overload
4
+
5
+ import cytoolz as toolz
6
+
7
+ from recurvedata.consts import ENV_ID_KEY
8
+
9
+ T = TypeVar("T")
10
+ _VT = TypeVar("_VT")
11
+ _KT = TypeVar("_KT")
12
+
13
+
14
+ # Hash helpers
15
+
16
+
17
+ def _get_hash(v: Union[str, bytes], hash_func: Callable) -> str:
18
+ if isinstance(v, str):
19
+ v = v.encode()
20
+ if not isinstance(v, bytes):
21
+ v = str(v).encode()
22
+ return hash_func(v).hexdigest()
23
+
24
+
25
def sha256hash(v: Union[str, bytes]) -> str:
    """Return the hexadecimal SHA-256 digest of ``v``."""
    return _get_hash(v, hash_func=hashlib.sha256)
27
+
28
+
29
def md5hash(v: Union[str, bytes]) -> str:
    """Return the hexadecimal MD5 digest of ``v``."""
    return _get_hash(v, hash_func=hashlib.md5)
31
+
32
+
33
+ # String helpers
34
+
35
+
36
def trim_prefix(s: str, sub: str) -> str:
    """Return ``s`` without the leading ``sub``; unchanged if it does not start with it."""
    return s[len(sub) :] if s.startswith(sub) else s
40
+
41
+
42
def trim_suffix(s: str, sub: str) -> str:
    """Return ``s`` without the trailing ``sub``; unchanged if it does not end with it.

    An empty ``sub`` leaves ``s`` untouched.
    """
    # Guard the empty suffix explicitly: every string "ends with" "", and
    # s[:-0] is s[:0], which would wrongly wipe the whole string.
    if not sub or not s.endswith(sub):
        return s
    return s[: -len(sub)]
46
+
47
+
48
def truncate_string(s: str, length: int, replacer: str = "...") -> str:
    """Cut ``s`` to its first ``length`` characters and append ``replacer``.

    Strings that are not longer than ``length`` are returned unchanged. The
    appended ``replacer`` is not counted towards ``length``.
    """
    return s[:length] + replacer if len(s) > length else s
52
+
53
+
54
def unescape_backslash(s: str) -> str:
    """Interpret backslash escape sequences (``\\n``, ``\\t``, ``\\uXXXX`` ...) in ``s``.

    Non-ASCII characters already present in ``s`` are preserved.
    """
    # The unicode_escape codec decodes raw bytes as Latin-1, so encoding with
    # UTF-8 first would garble every non-ASCII character. Encode as Latin-1
    # and turn anything outside it into \uXXXX escapes, which the decoder
    # then restores.
    return s.encode("latin-1", "backslashreplace").decode("unicode_escape")
56
+
57
+
58
def safe_int(v: Union[str, int, float], default: int = 0) -> int:
    """Convert ``v`` with ``int()``; return ``default`` if the conversion raises."""
    try:
        result = int(v)
    except Exception:
        result = default
    return result
63
+
64
+
65
def safe_float(v: Union[str, int, float], default: float = 0.0) -> float:
    """Convert ``v`` with ``float()``; return ``default`` if the conversion raises."""
    try:
        result = float(v)
    except Exception:
        result = default
    return result
70
+
71
+
72
+ # Container helpers
73
+
74
+
75
def first(seq: Iterable[T], default: T = None) -> T:
    """Return the first item of ``seq``, or ``default`` when it yields nothing."""
    return next(iter(seq), default)
80
+
81
+
82
def chunkify(lst: list, size: int) -> Generator[list, None, None]:
    """Yield consecutive slices of ``lst`` holding at most ``size`` items each."""
    offsets = range(0, len(lst), size)
    yield from (lst[start : start + size] for start in offsets)
85
+
86
+
87
def extract_dict(d: dict[_KT, _VT], keys: Iterable[_KT]) -> dict[_KT, _VT]:
    """Return the entries of ``d`` whose key is contained in ``keys``.

    The order of ``d`` is preserved. ``keys`` may be any iterable, including
    a one-shot iterator or generator.
    """
    wanted = keys
    if iter(keys) is keys:
        # A one-shot iterator would be consumed by the membership tests below
        # and silently drop matches, so materialize it once up front.
        materialized = list(keys)
        try:
            wanted = set(materialized)
        except TypeError:
            # unhashable candidates: fall back to linear membership tests
            wanted = materialized
    return {k: v for k, v in d.items() if k in wanted}
89
+
90
+
91
def ensure_list(v: Union[T, Iterable[T]]) -> list[T]:
    """Return ``v`` as a list.

    Tuples, sets and lists are copied into a new list; any other value
    (including strings and None) is wrapped in a single-element list.
    """
    return list(v) if isinstance(v, (tuple, set, list)) else [v]
95
+
96
+
97
def ensure_str_list(v: str, sep: str = ",", strip: bool = True) -> list[str]:
    """Turn a separated string (or an existing collection) into a list.

    Args:
        v: a string to split on ``sep``; None, or a tuple/set/list, is accepted too
        sep: the separator used to split a string value
        strip: strip whitespace around every item of a split string

    Returns:
        an empty list for None or an empty string, the split items for a
        string, or a list copy of a tuple/set/list

    Raises:
        TypeError: if ``v`` is of any other type
    """
    if v is None:
        return []

    if isinstance(v, (tuple, set, list)):
        return list(v)

    if not isinstance(v, str):
        raise TypeError(f'unsupported type "{type(v)}"')

    if not v:
        return []
    parts = v.split(sep)
    return [part.strip() for part in parts] if strip else parts
112
+
113
+
114
@overload
def replace_null_values(
    row: list[T],
    null_values: Union[list[T], set[T]],
    replacer: T = None,
) -> list[T]:
    ...


@overload
def replace_null_values(
    row: tuple[T, ...],
    null_values: Union[list[T], set[T]],
    replacer: T = None,
) -> tuple[T, ...]:
    ...


@overload
def replace_null_values(
    row: dict[_KT, _VT],
    null_values: Union[list[_VT], set[_VT]],
    replacer: _VT = None,
) -> dict[_KT, _VT]:
    ...


def replace_null_values(
    row: Union[list[_VT], tuple[_VT, ...], dict[_KT, _VT]],
    null_values: Union[list[_VT], set[_VT]],
    replacer: _VT = None,
) -> Union[list[_VT], tuple[_VT, ...], dict[_KT, _VT]]:
    """Replace every value of ``row`` that is contained in ``null_values``.

    Args:
        row: a list, tuple or dict; for a dict only the values are examined
        null_values: the values considered as null
        replacer: the value substituted for each null value

    Returns:
        a new list, tuple or dict mirroring the kind of ``row``

    Raises:
        TypeError: if ``row`` is not a list, tuple or dict
    """

    def _swap(value):
        return replacer if value in null_values else value

    if isinstance(row, list):
        return [_swap(item) for item in row]
    if isinstance(row, tuple):
        return tuple(_swap(item) for item in row)
    if isinstance(row, dict):
        return {key: _swap(item) for key, item in row.items()}
    raise TypeError(f"only list, tuple or dict type is supported, got {repr(type(row))}")
158
+
159
+
160
def get_env_id():
    """Return the environment variable named by ``ENV_ID_KEY`` as an int.

    Raises KeyError when the variable is not set.
    """
    raw_id = os.environ[ENV_ID_KEY]
    return int(raw_id)
162
+
163
+
164
+ def get_environment_variable(key: str, cast: Callable[[str], T] | None = None) -> T | None:
165
+ value = os.environ.get(key)
166
+ if value is None:
167
+ return None
168
+ if cast is not None:
169
+ return cast(value)
170
+ return value
@@ -0,0 +1,117 @@
1
+ import json
2
+ import os
3
+ import re
4
+ import shutil
5
+ import urllib.parse
6
+ from typing import Optional
7
+
8
+ import httpx
9
+ import requests
10
+ import requests.adapters
11
+ from urllib3.util.retry import Retry
12
+
13
+
14
def new_retry_session(
    max_retries=3,
    backoff_factor=0.3,
    method_whitelist=None,
    status_forcelist=(429, 500, 502, 503, 504),
    session=None,
):
    """Return a ``requests`` session whose HTTP(S) adapters retry failed requests.

    Args:
        max_retries: used as the total, read and connect retry limit.
        backoff_factor: passed to ``Retry`` as its back-off factor.
        method_whitelist: HTTP methods that may be retried; when empty or
            ``None``, ``Retry.DEFAULT_ALLOWED_METHODS`` is used.
        status_forcelist: response status codes passed to ``Retry`` to force a retry.
        session: an existing session to configure; a new ``requests.Session``
            is created when this is falsy.

    Returns:
        The session, with one shared adapter mounted for ``http://`` and ``https://``.
    """
    retryable_methods = method_whitelist or Retry.DEFAULT_ALLOWED_METHODS
    if not session:
        session = requests.Session()

    retry_policy = Retry(
        total=max_retries,
        read=max_retries,
        connect=max_retries,
        allowed_methods=retryable_methods,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = requests.adapters.HTTPAdapter(
        pool_connections=100,
        pool_maxsize=100,
        max_retries=retry_policy,
    )
    for url_prefix in ("http://", "https://"):
        session.mount(url_prefix, retrying_adapter)
    return session
36
+
37
+
38
def download_file(url: str, filepath: str, **kwargs) -> str:
    """Download ``url`` to disk and return the path that was written.

    Args:
        url: the resource to fetch with an HTTP GET.
        filepath: destination file path. If it is an existing directory, the
            file is saved inside it under the last segment of the URL's path.
        **kwargs: forwarded to ``requests.get`` (``stream`` is always set here).

    Returns:
        The path of the written file.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status; no
            file is created in that case.
    """
    if os.path.isdir(filepath):
        # Use only the path component so a query string or fragment
        # ("file.csv?token=...") does not end up in the file name.
        filename = urllib.parse.urlparse(url).path.split("/")[-1]
        filepath = os.path.join(filepath, filename)

    with requests.get(url, stream=True, **kwargs) as r:
        # Fail before touching the destination instead of saving an error page.
        r.raise_for_status()
        with open(filepath, "wb") as f:
            # iter_content applies gzip/deflate content decoding, which
            # reading from r.raw directly does not.
            for chunk in r.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

    return filepath
48
+
49
+
50
# One or more dot-terminated labels of 1-63 characters (letters, digits,
# hyphens; no leading or trailing hyphen), then an alphabetic final label and
# an optional trailing dot. re.ASCII keeps \d and the case-insensitive [A-Z]
# from matching non-ASCII characters; \Z (unlike $) rejects a trailing newline.
FQDN_RE = re.compile(
    r"^((?!-)[-A-Z\d]{1,63}(?<!-)\.)+[A-Z]{1,63}\.?\Z",
    re.IGNORECASE | re.ASCII,
)


def is_valid_domain(host: str) -> bool:
    """Return whether ``host`` looks like a fully qualified domain name.

    The name must be at most 253 characters long and match ``FQDN_RE``: at
    least two labels, the last one alphabetic. Single-label names such as
    ``localhost`` and IP addresses are therefore rejected.
    """
    if len(host) > 253:
        return False
    return bool(FQDN_RE.match(host))
57
+
58
+
59
def fill_scheme_to_url(url: str, scheme="https") -> str:
    """Return ``url`` with ``scheme`` prepended when it has no scheme of its own.

    A URL that already parses with a scheme is returned unchanged. Otherwise
    the parsed network location (or, when that is empty, the parsed path) is
    used as the host part of the rebuilt URL. If that host part contains a
    ``/`` and the text before the first ``/`` is not accepted by
    ``is_valid_domain``, the input is returned unchanged. An input that starts
    with ``://`` simply gets ``scheme`` put in front of it.
    """
    parsed = urllib.parse.urlparse(url)
    # Already has a scheme; nothing to do.
    if parsed.scheme != "":
        return url

    has_netloc = bool(parsed.netloc)
    host_part = parsed.netloc if has_netloc else parsed.path

    candidate_domain, slash, _rest = host_part.partition("/")
    if slash:
        if host_part.startswith("://"):
            return f"{scheme}{url}"
        if not is_valid_domain(candidate_domain):
            return url

    # When the host part was taken from the parsed path, it already carries
    # the path text, so the path slot of the rebuilt URL is left empty.
    rebuilt_path = parsed.path if has_netloc else ""
    rebuilt = urllib.parse.ParseResult(scheme, host_part, rebuilt_path, *parsed[3:])
    return rebuilt.geturl()
76
+
77
+
78
def ensure_url_list(s: str, fix_scheme=True) -> Optional[list[str]]:
    """Parse ``s`` into a list of URLs.

    Accepted forms:

    * a JSON array: its items are used as they are;
    * a JSON string: treated as a single URL;
    * anything else: split on commas, with surrounding whitespace and empty
      segments dropped.

    Args:
        s: the text to parse.
        fix_scheme: when true, every item is passed through ``fill_scheme_to_url``.

    Returns:
        ``None`` when ``s`` is empty or ``None``, otherwise the list of URLs.
    """
    if not s:
        return None

    try:
        decoded = json.loads(s)
    except json.JSONDecodeError:
        decoded = None

    if isinstance(decoded, list):
        urls = decoded
    elif isinstance(decoded, str):
        # A lone JSON string is one URL; iterating it would yield characters.
        urls = [decoded]
    else:
        # Not JSON, or a JSON value that cannot hold URLs (number, bool, null,
        # object): fall back to treating the raw text as comma-separated.
        urls = [part.strip() for part in s.split(",") if part.strip()]

    if fix_scheme:
        urls = [fill_scheme_to_url(x) for x in urls]
    return urls
90
+
91
+
92
# Connection-level (hop-by-hop) headers describe a single transport hop and
# must not be relayed by an intermediary.
_HOP_BY_HOP_HEADERS = frozenset(
    {
        "connection",
        "keep-alive",
        "proxy-authenticate",
        "proxy-authorization",
        "te",
        "trailer",
        "transfer-encoding",
        "upgrade",
    }
)
# "host" names the server that received the original request, not the
# endpoint; "content-length" is recomputed by the HTTP client from the body.
_REQUEST_HEADERS_NOT_FORWARDED = _HOP_BY_HOP_HEADERS | {"host", "content-length"}
# The relayed body is taken from ``response.content``, which httpx has already
# content-decoded, so the upstream encoding and length no longer describe it.
_RESPONSE_HEADERS_NOT_RELAYED = _HOP_BY_HOP_HEADERS | {"content-encoding", "content-length"}


async def forward(_request, endpoint: str):
    """
    Forward a request to another server (received via FastAPI) and return the response.
    We create a common method to use in multiple places.

    Args:
        _request: the incoming FastAPI ``Request``.
        endpoint: the URL the request is re-sent to.

    Returns:
        A FastAPI ``Response`` carrying the target server's status code, body
        and headers. The method, query parameters, body and end-to-end headers
        of the incoming request are sent to ``endpoint``.
    """
    from fastapi import Request
    from fastapi.responses import Response

    request: Request = _request
    method = request.method
    headers = {
        name: value
        for name, value in request.headers.items()
        if name.lower() not in _REQUEST_HEADERS_NOT_FORWARDED
    }
    params = request.query_params
    body = await request.body()

    async with httpx.AsyncClient() as client:
        # Forward the request
        response = await client.request(
            method=method,
            url=endpoint,
            headers=headers,
            params=params,
            content=body,
        )

    # Return the response from the target server. Building a plain dict keeps
    # one value per header name, as the original dict(response.headers) did.
    response_headers = {
        name: value
        for name, value in response.headers.items()
        if name.lower() not in _RESPONSE_HEADERS_NOT_RELAYED
    }
    return Response(content=response.content, status_code=response.status_code, headers=response_headers)