recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,266 @@
1
+ import bz2
2
+ import contextlib
3
+ import datetime
4
+ import glob
5
+ import gzip
6
+ import logging
7
+ import os
8
+ import shutil
9
+ import subprocess
10
+ import tempfile
11
+ import zipfile
12
+ from itertools import islice
13
+
14
+ from recurvedata.pigeon.utils import ensure_list
15
+
16
+
17
def new_tempfile(suffix="", prefix=None, dir=None):
    """Create an empty temporary file and return its path.

    The current local time (``%Y_%m_%d_%H_%M_%S``) followed by ``_`` is placed
    in front of ``suffix``, so the file name ends with ``<timestamp>_<suffix>``.

    :param suffix: text placed after the timestamp at the end of the file name
    :param prefix: file name prefix; the ``tempfile`` default is used when falsy
    :param dir: directory to create the file in; the ``tempfile`` default when None
    :return: the path of the newly created file
    """
    ts = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    kwargs = {"suffix": "{}_{}".format(ts, suffix), "dir": dir}
    if prefix:
        kwargs["prefix"] = prefix

    fd, filename = tempfile.mkstemp(**kwargs)
    # mkstemp hands back an open OS-level descriptor; only the path is
    # returned to the caller, so close it here instead of leaking it.
    os.close(fd)
    return filename
25
+
26
+
27
class new_stagefile_factory:
    """Callable that maps a file name to a path inside a staging directory.

    A relative ``directory`` is placed under ``/tmp``; an absolute one is used
    as given. The directory is created on each call if it does not exist yet.
    """

    def __init__(self, directory):
        # Anchor relative staging directories under /tmp.
        self.directory = directory if os.path.isabs(directory) else os.path.join("/tmp", directory)

    def __call__(self, name):
        """Ensure the staging directory exists and return ``<directory>/<name>``."""
        stage_dir = self.directory
        os.makedirs(stage_dir, exist_ok=True)
        return os.path.join(stage_dir, name)
36
+
37
+
38
def merge_files(files, filename=None, num_skip_lines=0, delete=True):
    """Concat multiple files into one file.

    :param files: source file names
    :param filename: target filename, will create a tempfile if not provided
    :param num_skip_lines: skip n lines of every source file before merging it into the target file
    :param delete: delete source files after they have been merged
    :return: the target filename
    :raises subprocess.CalledProcessError: if the ``cat`` command exits with a non-zero status
    """
    if filename is None:
        fd, filename = tempfile.mkstemp()
        # Only the path is needed; close the descriptor mkstemp opened.
        os.close(fd)

    if not num_skip_lines and len(files) == 1 and delete:
        # A single source that is going to be deleted anyway: just move it.
        # shutil.move also works when source and target are on different filesystems.
        shutil.move(files[0], filename)
        return filename

    if not num_skip_lines:
        # Merge with `cat` for better performance. The file names are passed as
        # an argument list (no shell), so spaces or shell metacharacters in a
        # path cannot break or alter the command.
        logging.info("cat %d file(s) into %s", len(files), filename)
        with open(filename, "wb") as fout:
            if files:  # `cat` without operands would block reading stdin
                subprocess.check_call(["cat", "--", *files], stdout=fout)
    else:
        with open(filename, "wb") as fout:
            for f in files:
                with open(f, "rb") as fin:
                    for _ in range(num_skip_lines):
                        fin.readline()
                    shutil.copyfileobj(fin, fout)

    if delete:
        remove_files_safely(files)

    return filename
70
+
71
+
72
def skip_lines(infile, lines, inplace=False):
    """Copy ``infile`` without its first ``lines`` lines.

    :param infile: the path of the input file
    :param lines: number of leading lines to drop
    :param inplace: replace ``infile`` with the result instead of keeping a separate file
    :return: ``infile`` when ``inplace`` is true, otherwise the path of the new temporary file
    """
    # For an in-place rewrite the temporary file is created next to the input,
    # so the final replace never has to cross a filesystem boundary.
    tmp_dir = os.path.dirname(os.path.abspath(infile)) if inplace else None
    tmpfile = new_tempfile(dir=tmp_dir)
    with open(infile, "rb") as fin, open(tmpfile, "wb") as fout:
        # skip the first n lines
        for _ in range(lines):
            fin.readline()

        # copy the rest to another file
        shutil.copyfileobj(fin, fout)

    if inplace:
        # os.replace overwrites an existing destination on every platform.
        os.replace(tmpfile, infile)
        return infile
    return tmpfile
86
+
87
+
88
def read_lines(filename, start_line, lines_num=1):
    """Lazily yield ``lines_num`` lines of a text file, starting at index ``start_line``.

    :param filename: the path of the file to read (opened in text mode)
    :param start_line: zero-based index of the first line to yield
    :param lines_num: maximum number of lines to yield
    :return: a generator of lines, each including its trailing newline if present
    """
    with open(filename) as handle:
        yield from islice(handle, start_line, start_line + lines_num)
92
+
93
+
94
def is_file_empty(filename):
    """Tell whether a file has zero size; a file that does not exist counts as empty."""
    try:
        size = os.path.getsize(filename)
    except FileNotFoundError:
        return True
    return size == 0
100
+
101
+
102
def remove_files(files):
    """Delete every given file; any error raised by the deletion propagates.

    :param files: passed through ``ensure_list`` before iterating
    """
    for path in ensure_list(files):
        os.remove(path)
105
+
106
+
107
def remove_files_safely(files):
    """Best-effort removal of files: errors are swallowed, never raised.

    Each path is handled independently, so one path that cannot be removed
    (already gone, permission denied, not a valid path) does not prevent the
    remaining paths from being deleted.

    :param files: passed through ``ensure_list`` before iterating
    """
    try:
        paths = ensure_list(files)
    except (TypeError, ValueError):
        return

    for path in paths:
        with contextlib.suppress(OSError, TypeError, ValueError):
            os.unlink(path)
110
+
111
+
112
def remove_files_by_pattern(pattern):
    """Log and then best-effort delete every path matching a glob pattern.

    :param pattern: a pattern understood by ``glob.glob``
    """
    matched = glob.glob(pattern)
    logging.info("files to be deleted: %s", str(matched))
    remove_files_safely(matched)
116
+
117
+
118
def remove_folder_safely(folder):
    """Recursively delete a folder if it exists, ignoring errors during removal."""
    if os.path.exists(folder):
        shutil.rmtree(folder, ignore_errors=True)
122
+
123
+
124
def gzip_compress(filename, target_filename=None, using_cmd=False):
    """Compress a file using gzip

    :param filename: the path of input file
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the gzip command line instead of Python GzipFile to speedup
    :return: the target_filename
    :raises subprocess.CalledProcessError: if the gzip command exits with a non-zero status
    """
    if target_filename is None:
        target_filename = new_tempfile(suffix=".gz")

    if using_cmd:
        # Run without a shell: the paths are passed as plain arguments and the
        # output is redirected by us, so spaces or shell metacharacters in a
        # path cannot break or alter the command.
        cmd = ["gzip", "-c", "--", filename]
        logging.info("running %s > %s", cmd, target_filename)
        with open(target_filename, "wb") as f_out:
            subprocess.check_call(cmd, stdout=f_out)
        return target_filename

    with open(filename, "rb") as f_in, gzip.GzipFile(target_filename, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    return target_filename
141
+
142
+
143
def gzip_decompress(filename, target_filename=None, using_cmd=False):
    """Decompress a gzip file

    :param filename: the path of the gzip file
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the gzip command line instead of Python GzipFile to speedup
    :return: the target_filename
    :raises subprocess.CalledProcessError: if the gzip command exits with a non-zero status
    """
    if target_filename is None:
        target_filename = new_tempfile()

    if using_cmd:
        # Run without a shell: the paths are passed as plain arguments and the
        # output is redirected by us, so spaces or shell metacharacters in a
        # path cannot break or alter the command.
        cmd = ["gzip", "-d", "-c", "--", filename]
        logging.info("running %s > %s", cmd, target_filename)
        with open(target_filename, "wb") as f_out:
            subprocess.check_call(cmd, stdout=f_out)
        return target_filename

    with gzip.GzipFile(filename, "rb") as f_in, open(target_filename, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    return target_filename
160
+
161
+
162
def bzip2_compress(filename, target_filename=None, using_cmd=False):
    """Compress a file using bzip2

    :param filename: the path of input file
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the bzip2 command line instead of Python BZ2File to speedup
    :return: the target_filename
    :raises subprocess.CalledProcessError: if the bzip2 command exits with a non-zero status
    """
    if target_filename is None:
        target_filename = new_tempfile(suffix=".bz2")

    if using_cmd:
        # Run without a shell: the paths are passed as plain arguments and the
        # output is redirected by us, so spaces or shell metacharacters in a
        # path cannot break or alter the command.
        cmd = ["bzip2", "-c", "--", filename]
        logging.info("running %s > %s", cmd, target_filename)
        with open(target_filename, "wb") as f_out:
            subprocess.check_call(cmd, stdout=f_out)
        return target_filename

    with open(filename, "rb") as f_in, bz2.BZ2File(target_filename, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    return target_filename
179
+
180
+
181
def bzip2_decompress(filename, target_filename=None, using_cmd=False):
    """Decompress a bzip2 file

    :param filename: the path of the bzip2 file
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the bzip2 command line instead of Python BZ2File to speedup
    :return: the target_filename
    :raises subprocess.CalledProcessError: if the bzip2 command exits with a non-zero status
    """
    if target_filename is None:
        target_filename = new_tempfile()

    if using_cmd:
        # Run without a shell: the paths are passed as plain arguments and the
        # output is redirected by us, so spaces or shell metacharacters in a
        # path cannot break or alter the command.
        cmd = ["bzip2", "-d", "-c", "--", filename]
        logging.info("running %s > %s", cmd, target_filename)
        with open(target_filename, "wb") as f_out:
            subprocess.check_call(cmd, stdout=f_out)
        return target_filename

    with bz2.BZ2File(filename, "rb") as f_in, open(target_filename, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    return target_filename
198
+
199
+
200
def zip_compress(filename, target_filename=None, using_cmd=False, arcname=None):
    """Compress a file (or a directory, recursively) using zip

    :param filename: the path of input file or directory
    :param target_filename: the path of output file, a temporary filename will be made otherwise
    :param using_cmd: use the zip command line instead of Python ZipFile to speedup
    :param arcname: name of the top-level entry in the archive, only supported with using_cmd=False
    :return: the target_filename
    :raises subprocess.CalledProcessError: if the zip command exits with a non-zero status
    """
    if target_filename is None:
        target_filename = new_tempfile(suffix=".zip")

    directory, basename = os.path.split(filename.rstrip("/"))

    if using_cmd:
        # Remove the (possibly just created, empty) target first and reuse only
        # its name; otherwise zip fails with:
        # zip warning: missing end signature--probably not a zip file (did you
        # zip warning: remember to use binary mode when you transferred it?)
        # zip warning: (if you are trying to read a damaged archive try -F)
        remove_files_safely(target_filename)
        if arcname is not None:
            logging.warning("arcname is not supported while using cmd")
        # zip runs inside the input's parent directory so the archive holds
        # paths relative to it. The target is made absolute because it would
        # otherwise be resolved against that directory instead of the caller's
        # cwd. An empty parent means "current directory" (cwd=None).
        cmd = ["zip", "-r", os.path.abspath(target_filename), basename]
        logging.info("running %s (cwd=%s)", cmd, directory or ".")
        subprocess.check_call(cmd, cwd=directory or None, stdout=subprocess.DEVNULL)
        return target_filename

    root_arcname = arcname or basename
    target_abs = os.path.abspath(target_filename)
    with zipfile.ZipFile(target_filename, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.write(filename, arcname=root_arcname)
        if os.path.isdir(filename):
            # ZipFile.write does not recurse; add the directory contents
            # explicitly so the result matches what `zip -r` produces.
            for current, dirnames, filenames in os.walk(filename):
                rel = os.path.relpath(current, filename)
                prefix = root_arcname if rel == os.curdir else os.path.join(root_arcname, rel)
                for name in sorted(dirnames) + sorted(filenames):
                    path = os.path.join(current, name)
                    if os.path.abspath(path) == target_abs:
                        continue  # never add the archive to itself
                    zf.write(path, arcname=os.path.join(prefix, name))
    return target_filename
227
+
228
+
229
def zip_decompress(filename, target_directory=None, using_cmd=False):
    """Decompress a .zip file

    :param filename: the path of input file
    :param target_directory: the path of output directory, a temporary directory will be made otherwise
    :param using_cmd: use the unzip command line instead of Python ZipFile to speedup
    :return: the output directory
    :raises subprocess.CalledProcessError: if the unzip command exits with a non-zero status
    """
    if not target_directory:
        target_directory = tempfile.mkdtemp()

    if using_cmd:
        # Run without a shell so spaces or shell metacharacters in either path
        # cannot break or alter the command.
        cmd = ["unzip", filename, "-d", target_directory]
        logging.info("running %s", cmd)
        subprocess.check_call(cmd, stdout=subprocess.DEVNULL)
        return target_directory

    with zipfile.ZipFile(filename, "r") as zf:
        zf.extractall(target_directory)
    return target_directory
246
+
247
+
248
@contextlib.contextmanager
def ensure_remove(filename):
    """Context manager that yields ``filename`` and best-effort deletes it on exit.

    The deletion runs whether the ``with`` body finishes normally or raises.
    """
    with contextlib.ExitStack() as cleanup:
        cleanup.callback(remove_files_safely, filename)
        yield filename
254
+
255
+
256
def schema_filename(base):
    """Return ``base`` with ``.schema`` appended."""
    return "{}.schema".format(base)
258
+
259
+
260
def exists(path):
    """Return True when ``path`` exists (thin wrapper over ``os.path.exists``)."""
    found = os.path.exists(path)
    return found
262
+
263
+
264
def _run_command(cmd):
    """Log a shell command line and run it through the shell.

    The command's stdout is captured and discarded.

    :param cmd: the command line, interpreted by the shell
    :raises subprocess.CalledProcessError: if the command exits with a non-zero status
    """
    logging.info(cmd)
    subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE)
@@ -0,0 +1,44 @@
1
+ import datetime
2
+ import decimal
3
+ import json
4
+
5
+
6
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that also handles decimals, dates/datetimes and timedeltas."""

    def default(self, obj):
        """Convert values the stock encoder rejects.

        ``Decimal`` becomes a float, ``date``/``datetime`` an ISO-8601 string and
        ``timedelta`` its ``str()`` form; anything else is left to the base
        class, which raises ``TypeError``.
        """
        if isinstance(obj, datetime.timedelta):
            return str(obj)

        # datetime.datetime is a subclass of datetime.date, so one check covers both
        if isinstance(obj, datetime.date):
            return obj.isoformat()

        if isinstance(obj, decimal.Decimal):
            return float(obj)

        return super().default(obj)
18
+
19
+
20
def json_dumps(obj, **kwargs):
    """Serialize ``obj`` to a JSON string, using ``JSONEncoder`` unless ``cls`` is given.

    :param kwargs: forwarded to ``json.dumps``
    """
    if "cls" not in kwargs:
        kwargs["cls"] = JSONEncoder
    return json.dumps(obj, **kwargs)
23
+
24
+
25
def dump_json(obj, fp=None, **kwargs):
    """Serialize ``obj`` as JSON to a string, a file path or a file object.

    Defaults (each can be overridden through ``kwargs``): ``indent=4``,
    ``ensure_ascii=False``, ``sort_keys=True``, ``cls=JSONEncoder``.

    :param obj: the object to serialize
    :param fp: None to get the JSON text back, a ``str`` path to write to that
        file, or an already opened writable text file object
    :param kwargs: forwarded to ``json.dumps`` / ``json.dump``
    :return: the JSON string when ``fp`` is None, otherwise None
    """
    kwargs.setdefault("indent", 4)
    kwargs.setdefault("ensure_ascii", False)
    kwargs.setdefault("sort_keys", True)
    if "cls" not in kwargs:
        kwargs["cls"] = JSONEncoder

    if fp is None:
        return json.dumps(obj, **kwargs)

    if isinstance(fp, str):
        # With ensure_ascii=False the output contains raw non-ASCII characters;
        # write it as UTF-8 explicitly instead of relying on the locale encoding.
        with open(fp, "w", encoding="utf-8") as f:
            return json.dump(obj, f, **kwargs)
    return json.dump(obj, fp, **kwargs)
37
+
38
+
39
def load_json(fp, **kwargs):
    """Parse JSON from a file path or a file object.

    :param fp: a ``str`` path (read as UTF-8) or an already opened readable file object
    :param kwargs: forwarded to ``json.load``
    :return: the decoded object
    """
    if isinstance(fp, str):
        # Read as UTF-8 explicitly rather than with the locale encoding.
        with open(fp, "r", encoding="utf-8") as f:
            return json.load(f, **kwargs)
    return json.load(fp, **kwargs)
@@ -0,0 +1,85 @@
1
+ """
2
+ This file is taken from SQLAlchemy
3
+ """
4
+
5
+
6
class AbstractKeyedTuple(tuple):
    """Base ``tuple`` subclass exposing the ``_fields`` of a subclass as a list of keys."""

    __slots__ = ()

    def keys(self):
        """Return a list of string key names for this :class:`.KeyedTuple`.

        The list is a fresh copy built from ``_fields``.

        .. seealso::

            :attr:`.KeyedTuple._fields`

        """
        return [*self._fields]
19
+
20
+
21
class KeyedTuple(AbstractKeyedTuple):
    """``tuple`` subclass that adds labeled names.

    E.g.::

        >>> k = KeyedTuple([1, 2, 3], labels=["one", "two", "three"])
        >>> k.one
        1
        >>> k.two
        2

    Result rows returned by :class:`.Query` that contain multiple
    ORM entities and/or column expressions make use of this
    class to return rows.

    The :class:`.KeyedTuple` exhibits similar behavior to the
    ``collections.namedtuple()`` construct provided in the Python
    standard library, however is architected very differently.
    Unlike ``collections.namedtuple()``, :class:`.KeyedTuple`
    does not rely on creation of custom subtypes in order to represent
    a new series of keys, instead each :class:`.KeyedTuple` instance
    receives its list of keys in place.   The subtype approach
    of ``collections.namedtuple()`` introduces significant complexity
    and performance overhead, which is not necessary for the
    :class:`.Query` object's use case.

    .. seealso::

        :ref:`ormtutorial_querying`

    """

    def __new__(cls, vals, labels=None):
        instance = tuple.__new__(cls, vals)
        if not labels:
            labels = []
        else:
            # expose every value as an attribute named after its label
            instance.__dict__.update(zip(labels, vals))
        # written through __dict__ because __setattr__ always raises
        instance.__dict__["_labels"] = labels
        return instance

    @property
    def _fields(self):
        """Return a tuple of string key names for this :class:`.KeyedTuple`.

        Labels that are ``None`` are left out.

        This method provides compatibility with ``collections.namedtuple()``.

        .. seealso::

            :meth:`.KeyedTuple.keys`

        """
        return tuple(label for label in self._labels if label is not None)

    def __setattr__(self, key, value):
        raise AttributeError("Can't set attribute: %s" % key)

    def _asdict(self):
        """Return the contents of this :class:`.KeyedTuple` as a dictionary.

        This method provides compatibility with ``collections.namedtuple()``,
        with the exception that the dictionary returned is **not** ordered.

        """
        attributes = self.__dict__
        mapping = {}
        for key in self.keys():
            mapping[key] = attributes[key]
        return mapping
@@ -0,0 +1,156 @@
1
+ import logging
2
+ import threading
3
+ import time
4
+ from multiprocessing import Process
5
+ from multiprocessing.queues import Queue
6
+ from queue import Empty, Full
7
+ from subprocess import PIPE, STDOUT, CalledProcessError, Popen
8
+ from typing import Any, List, Optional, Tuple, Union
9
+
10
+
11
def safe_join_subprocesses(workers, result_queue):
    """Collect results from ``result_queue`` until every worker has exited.

    The queue is read without blocking while workers are still alive, and
    once more after the last worker is gone.

    :param workers: iterable of process-like objects exposing ``is_alive()``
    :param result_queue: queue the workers put their results into
    :return: list of every item read from ``result_queue``, in read order
    """
    result = []

    def drain():
        # Pull everything that is available right now without blocking.
        try:
            while True:
                result.append(result_queue.get(False))
        except Empty:
            pass

    live_workers = list(workers)
    while live_workers:
        drain()
        time.sleep(0.5)  # Give tasks a chance to put more data in
        if not result_queue.empty():
            continue
        live_workers = [p for p in live_workers if p.is_alive()]

    # A worker can put its last item between the emptiness check and the
    # liveness check above; a final pass picks up anything left behind.
    drain()
    return result
26
+
27
+
28
def has_process_fail(workers: List[Process], log=True):
    """Tell whether any worker has stopped with an exit code other than 0.

    Workers that are still alive are skipped.

    :param workers: processes to inspect
    :param log: when true, log the pid and exit code of the failed process
    :return: ``True`` on the first stopped worker whose ``exitcode`` is not 0,
        otherwise ``False``
    """
    for worker in workers:
        stopped = not worker.is_alive()
        if stopped and worker.exitcode != 0:
            if log:
                logging.info(f"found process {worker.pid} fail, exitcode {worker.exitcode}")
            return True
    return False
37
+
38
+
39
def terminate_processes(workers: List[Process]):
    """Call ``terminate()`` on every worker that is still alive.

    Logs a message before and after each termination; workers that are
    no longer alive are left untouched.

    :param workers: processes to terminate
    """
    for worker in workers:
        if not worker.is_alive():
            continue
        logging.info(f"start terminate process {worker.pid}")
        worker.terminate()
        logging.info(f"finish terminate process {worker.pid}")
45
+
46
+
47
def master_safe_put_queue(
    queue: Queue, obj: Any, workers: List[Process], block=True, timeout: Optional[int] = None
) -> Optional[bool]:
    """
    ``queue.put`` for the case where a master puts data and workers consume it.

    With the defaults ``timeout=None, block=True``, if ``queue.maxsize`` is
    small and the workers hit an error and stop consuming, the master would
    get stuck inside ``queue.put``.
    To avoid that, when ``timeout=None`` and ``block=True`` this function
    retries ``queue.put(timeout=10)`` in an endless loop.
    Each time ``queue.put`` has been stuck for 10s it raises ``queue.Full``,
    and the workers are then checked for abnormally exited processes:
    if any worker exited abnormally, ``True`` is returned, meaning a failed
    worker is what kept the master's ``queue.put`` stuck;
    if all workers are fine, they are simply consuming slowly, so
    ``queue.put(timeout=10)`` is called again and the loop continues.

    In every other case this behaves exactly like ``queue.put``.

    :param queue: queue
    :param obj: the obj to put into queue
    :param workers: sub processes
    :param block: should block when queue has no free slot
    :param timeout: queue.put's timeout
    :return: ``True`` when a worker exited abnormally and left the master's
        ``queue.put`` stuck; otherwise ``None``
    """
    wait_indefinitely = block and timeout is None
    if not wait_indefinitely:
        # Caller chose explicit blocking semantics: defer to queue.put as-is.
        return queue.put(obj, block=block, timeout=timeout)

    while True:
        try:
            return queue.put(obj, timeout=10)
        except Full:
            # Still full after 10s: give up only if a worker has died.
            if has_process_fail(workers):
                return True
81
+
82
+
83
def safe_join_subprocesses_early_stop(workers: List[Process], result_queue: Queue) -> Tuple[List, bool]:
    """
    Wait for the sub workers and read their results from ``result_queue``.

    Returns when
    1) one sub worker has failed (early stop)
    or
    2) all sub workers have exited without failure

    :param workers: sub processes
    :param result_queue: queue which the sub processes put results into
    :return: results read from the sub workers, and the early_stop flag
    """
    result = []
    live_workers = list(workers)
    last_check_early_stop_time = time.time()
    while live_workers:
        try:
            while True:
                result.append(result_queue.get(False))

                # While results keep arriving this inner loop does not exit,
                # so worker health is re-checked here at most every 10s.
                if time.time() - last_check_early_stop_time > 10:
                    if has_process_fail(live_workers):
                        return result, True
                    last_check_early_stop_time = time.time()
        except Empty:
            pass

        time.sleep(0.5)  # Give tasks a chance to put more data in
        if not result_queue.empty():
            continue

        # Check for failures before pruning, while dead workers are still listed.
        if has_process_fail(live_workers):
            return result, True
        last_check_early_stop_time = time.time()
        live_workers = [p for p in live_workers if p.is_alive()]

    # A worker can put its last item between the emptiness check and the
    # liveness check above; a final pass picks up anything left behind.
    try:
        while True:
            result.append(result_queue.get(False))
    except Empty:
        pass
    return result, False
122
+
123
+
124
def run_subprocess(cmd: Union[str, List], stdout=PIPE, stderr=STDOUT, return_output=False, **kwargs) -> Optional[str]:
    """Run ``cmd`` as a child process and log its output line by line.

    :param cmd: command handed to ``Popen`` (a string or an argument list)
    :param stdout: ``Popen`` stdout target; output is read only when this
        gives the child a pipe
    :param stderr: ``Popen`` stderr target, ``STDOUT`` by default
    :param return_output: when true, collect the logged lines for the result
    :param kwargs: extra keyword arguments forwarded to ``Popen``
    :return: the collected lines joined with ``"\\n"`` when ``return_output``
        is true, otherwise an empty string
    :raises CalledProcessError: when the child exits with a non-zero code
    """
    p = Popen(cmd, stdout=stdout, stderr=stderr, **kwargs)
    logging.info(f"started sub process: {cmd}, pid: {p.pid}")
    lines: List[str] = []
    # p.stdout is None when the caller redirected stdout away from a pipe
    # (e.g. DEVNULL or a file); there is then nothing to read.
    if p.stdout is not None:
        try:
            # Iterating the stream stops at EOF in both binary and text mode.
            for raw_line in p.stdout:
                if isinstance(raw_line, bytes):
                    # Logging must not fail on output that is not valid UTF-8.
                    raw_line = raw_line.decode("utf8", errors="replace")
                line = raw_line.rstrip()
                logging.info(line)
                if return_output:
                    lines.append(line)
        finally:
            p.stdout.close()
    p.wait()
    logging.info("sub process exited with return code %s", p.returncode)
    if p.returncode:
        raise CalledProcessError(p.returncode, p.args)
    return "\n".join(lines)
138
+
139
+
140
class PropagatingThread(threading.Thread):
    """Thread whose ``join()`` returns the target's result or re-raises its exception."""

    # Class-level defaults keep ``join()`` safe when it returns before
    # ``run()`` has stored anything (timeout expired, or no target given).
    exc = None
    ret = None

    def run(self):
        """Call the target, recording its return value or the exception it raised."""
        self.exc = None
        try:
            # Like ``threading.Thread.run``, a thread without a target does nothing.
            if self._target is not None:
                self.ret = self._target(*self._args, **self._kwargs)
        except BaseException as e:
            # Kept for ``join()`` to re-raise in the joining thread.
            self.exc = e

    def join(self, timeout=None):
        """Wait for the thread, then surface the target's outcome.

        :param timeout: seconds to wait, as for ``threading.Thread.join``
        :return: the target's return value; ``None`` if the target has not
            finished (timeout expired) or no target was given
        :raises BaseException: whatever exception the target raised
        """
        super().join(timeout)
        if self.exc is not None:
            raise self.exc
        return self.ret