recurvedata-lib 0.1.487__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of recurvedata-lib might be problematic. Click here for more details.

Files changed (333) hide show
  1. recurvedata/__init__.py +0 -0
  2. recurvedata/__version__.py +1 -0
  3. recurvedata/client/__init__.py +3 -0
  4. recurvedata/client/client.py +150 -0
  5. recurvedata/client/server_client.py +91 -0
  6. recurvedata/config.py +99 -0
  7. recurvedata/connectors/__init__.py +20 -0
  8. recurvedata/connectors/_register.py +46 -0
  9. recurvedata/connectors/base.py +111 -0
  10. recurvedata/connectors/config_schema.py +1575 -0
  11. recurvedata/connectors/connectors/__init__.py +0 -0
  12. recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
  13. recurvedata/connectors/connectors/auth.py +44 -0
  14. recurvedata/connectors/connectors/azure_blob.py +89 -0
  15. recurvedata/connectors/connectors/azure_synapse.py +79 -0
  16. recurvedata/connectors/connectors/bigquery.py +359 -0
  17. recurvedata/connectors/connectors/clickhouse.py +219 -0
  18. recurvedata/connectors/connectors/dingtalk.py +61 -0
  19. recurvedata/connectors/connectors/doris.py +215 -0
  20. recurvedata/connectors/connectors/es.py +62 -0
  21. recurvedata/connectors/connectors/feishu.py +65 -0
  22. recurvedata/connectors/connectors/ftp.py +50 -0
  23. recurvedata/connectors/connectors/generic.py +49 -0
  24. recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
  25. recurvedata/connectors/connectors/google_service_account.py +225 -0
  26. recurvedata/connectors/connectors/hive.py +207 -0
  27. recurvedata/connectors/connectors/impala.py +210 -0
  28. recurvedata/connectors/connectors/jenkins.py +51 -0
  29. recurvedata/connectors/connectors/mail.py +89 -0
  30. recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
  31. recurvedata/connectors/connectors/mongo.py +79 -0
  32. recurvedata/connectors/connectors/mssql.py +131 -0
  33. recurvedata/connectors/connectors/mysql.py +191 -0
  34. recurvedata/connectors/connectors/n8n.py +141 -0
  35. recurvedata/connectors/connectors/oss.py +74 -0
  36. recurvedata/connectors/connectors/owncloud.py +36 -0
  37. recurvedata/connectors/connectors/phoenix.py +36 -0
  38. recurvedata/connectors/connectors/postgres.py +230 -0
  39. recurvedata/connectors/connectors/python.py +50 -0
  40. recurvedata/connectors/connectors/redshift.py +187 -0
  41. recurvedata/connectors/connectors/s3.py +93 -0
  42. recurvedata/connectors/connectors/sftp.py +87 -0
  43. recurvedata/connectors/connectors/slack.py +35 -0
  44. recurvedata/connectors/connectors/spark.py +99 -0
  45. recurvedata/connectors/connectors/starrocks.py +175 -0
  46. recurvedata/connectors/connectors/tencent_cos.py +40 -0
  47. recurvedata/connectors/connectors/tidb.py +49 -0
  48. recurvedata/connectors/const.py +315 -0
  49. recurvedata/connectors/datasource.py +189 -0
  50. recurvedata/connectors/dbapi.py +469 -0
  51. recurvedata/connectors/fs.py +66 -0
  52. recurvedata/connectors/ftp.py +40 -0
  53. recurvedata/connectors/object_store.py +60 -0
  54. recurvedata/connectors/pigeon.py +172 -0
  55. recurvedata/connectors/proxy.py +104 -0
  56. recurvedata/connectors/service.py +223 -0
  57. recurvedata/connectors/utils.py +47 -0
  58. recurvedata/consts.py +49 -0
  59. recurvedata/core/__init__.py +0 -0
  60. recurvedata/core/config.py +46 -0
  61. recurvedata/core/configurable.py +27 -0
  62. recurvedata/core/consts.py +2 -0
  63. recurvedata/core/templating.py +206 -0
  64. recurvedata/core/tracing.py +223 -0
  65. recurvedata/core/transformer.py +186 -0
  66. recurvedata/core/translation.py +91 -0
  67. recurvedata/dbt/client.py +97 -0
  68. recurvedata/dbt/consts.py +99 -0
  69. recurvedata/dbt/cosmos_utils.py +275 -0
  70. recurvedata/dbt/error_codes.py +18 -0
  71. recurvedata/dbt/schemas.py +98 -0
  72. recurvedata/dbt/service.py +451 -0
  73. recurvedata/dbt/utils.py +246 -0
  74. recurvedata/error_codes.py +71 -0
  75. recurvedata/exceptions.py +72 -0
  76. recurvedata/executors/__init__.py +4 -0
  77. recurvedata/executors/cli/__init__.py +7 -0
  78. recurvedata/executors/cli/connector.py +117 -0
  79. recurvedata/executors/cli/dbt.py +118 -0
  80. recurvedata/executors/cli/main.py +82 -0
  81. recurvedata/executors/cli/parameters.py +18 -0
  82. recurvedata/executors/client.py +190 -0
  83. recurvedata/executors/consts.py +50 -0
  84. recurvedata/executors/debug_executor.py +100 -0
  85. recurvedata/executors/executor.py +300 -0
  86. recurvedata/executors/link_executor.py +189 -0
  87. recurvedata/executors/models.py +34 -0
  88. recurvedata/executors/schemas.py +222 -0
  89. recurvedata/executors/service/__init__.py +0 -0
  90. recurvedata/executors/service/connector.py +380 -0
  91. recurvedata/executors/utils.py +172 -0
  92. recurvedata/filestorage/__init__.py +11 -0
  93. recurvedata/filestorage/_factory.py +33 -0
  94. recurvedata/filestorage/backends/__init__.py +0 -0
  95. recurvedata/filestorage/backends/fsspec.py +45 -0
  96. recurvedata/filestorage/backends/local.py +67 -0
  97. recurvedata/filestorage/backends/oss.py +56 -0
  98. recurvedata/filestorage/interface.py +84 -0
  99. recurvedata/operators/__init__.py +10 -0
  100. recurvedata/operators/base.py +28 -0
  101. recurvedata/operators/config.py +21 -0
  102. recurvedata/operators/context.py +255 -0
  103. recurvedata/operators/dbt_operator/__init__.py +2 -0
  104. recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
  105. recurvedata/operators/dbt_operator/operator.py +353 -0
  106. recurvedata/operators/link_operator/__init__.py +1 -0
  107. recurvedata/operators/link_operator/operator.py +120 -0
  108. recurvedata/operators/models.py +55 -0
  109. recurvedata/operators/notify_operator/__init__.py +1 -0
  110. recurvedata/operators/notify_operator/operator.py +180 -0
  111. recurvedata/operators/operator.py +119 -0
  112. recurvedata/operators/python_operator/__init__.py +1 -0
  113. recurvedata/operators/python_operator/operator.py +132 -0
  114. recurvedata/operators/sensor_operator/__init__.py +1 -0
  115. recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
  116. recurvedata/operators/sensor_operator/operator.py +172 -0
  117. recurvedata/operators/spark_operator/__init__.py +1 -0
  118. recurvedata/operators/spark_operator/operator.py +200 -0
  119. recurvedata/operators/spark_operator/spark_sample.py +47 -0
  120. recurvedata/operators/sql_operator/__init__.py +1 -0
  121. recurvedata/operators/sql_operator/operator.py +90 -0
  122. recurvedata/operators/task.py +211 -0
  123. recurvedata/operators/transfer_operator/__init__.py +40 -0
  124. recurvedata/operators/transfer_operator/const.py +10 -0
  125. recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
  126. recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
  127. recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
  128. recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
  129. recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
  130. recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
  131. recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
  132. recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
  133. recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
  134. recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
  135. recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
  136. recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
  137. recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
  138. recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
  139. recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
  140. recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
  141. recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
  142. recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
  143. recurvedata/operators/transfer_operator/load_task_email.py +188 -0
  144. recurvedata/operators/transfer_operator/load_task_es.py +86 -0
  145. recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
  146. recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
  147. recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
  148. recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
  149. recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
  150. recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
  151. recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
  152. recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
  153. recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
  154. recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
  155. recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
  156. recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
  157. recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
  158. recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
  159. recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
  160. recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
  161. recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
  162. recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
  163. recurvedata/operators/transfer_operator/mixin.py +31 -0
  164. recurvedata/operators/transfer_operator/operator.py +231 -0
  165. recurvedata/operators/transfer_operator/task.py +223 -0
  166. recurvedata/operators/transfer_operator/utils.py +134 -0
  167. recurvedata/operators/ui.py +80 -0
  168. recurvedata/operators/utils/__init__.py +51 -0
  169. recurvedata/operators/utils/file_factory.py +150 -0
  170. recurvedata/operators/utils/fs.py +10 -0
  171. recurvedata/operators/utils/lineage.py +265 -0
  172. recurvedata/operators/web_init.py +15 -0
  173. recurvedata/pigeon/connector/__init__.py +294 -0
  174. recurvedata/pigeon/connector/_registry.py +17 -0
  175. recurvedata/pigeon/connector/aliyun_oss.py +80 -0
  176. recurvedata/pigeon/connector/awss3.py +123 -0
  177. recurvedata/pigeon/connector/azure_blob.py +176 -0
  178. recurvedata/pigeon/connector/azure_synapse.py +51 -0
  179. recurvedata/pigeon/connector/cass.py +151 -0
  180. recurvedata/pigeon/connector/clickhouse.py +403 -0
  181. recurvedata/pigeon/connector/clickhouse_native.py +351 -0
  182. recurvedata/pigeon/connector/dbapi.py +571 -0
  183. recurvedata/pigeon/connector/doris.py +166 -0
  184. recurvedata/pigeon/connector/es.py +176 -0
  185. recurvedata/pigeon/connector/feishu.py +1135 -0
  186. recurvedata/pigeon/connector/ftp.py +163 -0
  187. recurvedata/pigeon/connector/google_bigquery.py +283 -0
  188. recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
  189. recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
  190. recurvedata/pigeon/connector/hdfs.py +204 -0
  191. recurvedata/pigeon/connector/hive_impala.py +383 -0
  192. recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
  193. recurvedata/pigeon/connector/mongodb.py +56 -0
  194. recurvedata/pigeon/connector/mssql.py +467 -0
  195. recurvedata/pigeon/connector/mysql.py +175 -0
  196. recurvedata/pigeon/connector/owncloud.py +92 -0
  197. recurvedata/pigeon/connector/postgresql.py +267 -0
  198. recurvedata/pigeon/connector/power_bi.py +179 -0
  199. recurvedata/pigeon/connector/qcloud_cos.py +79 -0
  200. recurvedata/pigeon/connector/redshift.py +123 -0
  201. recurvedata/pigeon/connector/sftp.py +73 -0
  202. recurvedata/pigeon/connector/sqlite.py +42 -0
  203. recurvedata/pigeon/connector/starrocks.py +144 -0
  204. recurvedata/pigeon/connector/tableau.py +162 -0
  205. recurvedata/pigeon/const.py +21 -0
  206. recurvedata/pigeon/csv.py +172 -0
  207. recurvedata/pigeon/docs/datasources-example.json +82 -0
  208. recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
  209. recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
  210. recurvedata/pigeon/dumper/__init__.py +171 -0
  211. recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
  212. recurvedata/pigeon/dumper/base.py +141 -0
  213. recurvedata/pigeon/dumper/cass.py +213 -0
  214. recurvedata/pigeon/dumper/dbapi.py +346 -0
  215. recurvedata/pigeon/dumper/es.py +112 -0
  216. recurvedata/pigeon/dumper/ftp.py +64 -0
  217. recurvedata/pigeon/dumper/mongodb.py +103 -0
  218. recurvedata/pigeon/handler/__init__.py +4 -0
  219. recurvedata/pigeon/handler/base.py +153 -0
  220. recurvedata/pigeon/handler/csv_handler.py +290 -0
  221. recurvedata/pigeon/loader/__init__.py +87 -0
  222. recurvedata/pigeon/loader/base.py +83 -0
  223. recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
  224. recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
  225. recurvedata/pigeon/loader/csv_to_doris.py +215 -0
  226. recurvedata/pigeon/loader/csv_to_es.py +51 -0
  227. recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
  228. recurvedata/pigeon/loader/csv_to_hive.py +468 -0
  229. recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
  230. recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
  231. recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
  232. recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
  233. recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
  234. recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
  235. recurvedata/pigeon/meta.py +116 -0
  236. recurvedata/pigeon/row_factory.py +42 -0
  237. recurvedata/pigeon/schema/__init__.py +124 -0
  238. recurvedata/pigeon/schema/types.py +13 -0
  239. recurvedata/pigeon/sync.py +283 -0
  240. recurvedata/pigeon/transformer.py +146 -0
  241. recurvedata/pigeon/utils/__init__.py +134 -0
  242. recurvedata/pigeon/utils/bloomfilter.py +181 -0
  243. recurvedata/pigeon/utils/date_time.py +323 -0
  244. recurvedata/pigeon/utils/escape.py +15 -0
  245. recurvedata/pigeon/utils/fs.py +266 -0
  246. recurvedata/pigeon/utils/json.py +44 -0
  247. recurvedata/pigeon/utils/keyed_tuple.py +85 -0
  248. recurvedata/pigeon/utils/mp.py +156 -0
  249. recurvedata/pigeon/utils/sql.py +328 -0
  250. recurvedata/pigeon/utils/timing.py +155 -0
  251. recurvedata/provider_manager.py +0 -0
  252. recurvedata/providers/__init__.py +0 -0
  253. recurvedata/providers/dbapi/__init__.py +0 -0
  254. recurvedata/providers/flywheel/__init__.py +0 -0
  255. recurvedata/providers/mysql/__init__.py +0 -0
  256. recurvedata/schedulers/__init__.py +1 -0
  257. recurvedata/schedulers/airflow.py +974 -0
  258. recurvedata/schedulers/airflow_db_process.py +331 -0
  259. recurvedata/schedulers/airflow_operators.py +61 -0
  260. recurvedata/schedulers/airflow_plugin.py +9 -0
  261. recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
  262. recurvedata/schedulers/base.py +99 -0
  263. recurvedata/schedulers/cli.py +228 -0
  264. recurvedata/schedulers/client.py +56 -0
  265. recurvedata/schedulers/consts.py +52 -0
  266. recurvedata/schedulers/debug_celery.py +62 -0
  267. recurvedata/schedulers/model.py +63 -0
  268. recurvedata/schedulers/schemas.py +97 -0
  269. recurvedata/schedulers/service.py +20 -0
  270. recurvedata/schedulers/system_dags.py +59 -0
  271. recurvedata/schedulers/task_status.py +279 -0
  272. recurvedata/schedulers/utils.py +73 -0
  273. recurvedata/schema/__init__.py +0 -0
  274. recurvedata/schema/field.py +88 -0
  275. recurvedata/schema/schema.py +55 -0
  276. recurvedata/schema/types.py +17 -0
  277. recurvedata/schema.py +0 -0
  278. recurvedata/server/__init__.py +0 -0
  279. recurvedata/server/app.py +7 -0
  280. recurvedata/server/connector/__init__.py +0 -0
  281. recurvedata/server/connector/api.py +79 -0
  282. recurvedata/server/connector/schemas.py +28 -0
  283. recurvedata/server/data_service/__init__.py +0 -0
  284. recurvedata/server/data_service/api.py +126 -0
  285. recurvedata/server/data_service/client.py +18 -0
  286. recurvedata/server/data_service/consts.py +1 -0
  287. recurvedata/server/data_service/schemas.py +68 -0
  288. recurvedata/server/data_service/service.py +218 -0
  289. recurvedata/server/dbt/__init__.py +0 -0
  290. recurvedata/server/dbt/api.py +116 -0
  291. recurvedata/server/error_code.py +49 -0
  292. recurvedata/server/exceptions.py +19 -0
  293. recurvedata/server/executor/__init__.py +0 -0
  294. recurvedata/server/executor/api.py +37 -0
  295. recurvedata/server/executor/schemas.py +30 -0
  296. recurvedata/server/executor/service.py +220 -0
  297. recurvedata/server/main.py +32 -0
  298. recurvedata/server/schedulers/__init__.py +0 -0
  299. recurvedata/server/schedulers/api.py +252 -0
  300. recurvedata/server/schedulers/schemas.py +50 -0
  301. recurvedata/server/schemas.py +50 -0
  302. recurvedata/utils/__init__.py +15 -0
  303. recurvedata/utils/_typer.py +61 -0
  304. recurvedata/utils/attrdict.py +19 -0
  305. recurvedata/utils/command_helper.py +20 -0
  306. recurvedata/utils/compat.py +12 -0
  307. recurvedata/utils/compression.py +203 -0
  308. recurvedata/utils/crontab.py +42 -0
  309. recurvedata/utils/crypto_util.py +305 -0
  310. recurvedata/utils/dataclass.py +11 -0
  311. recurvedata/utils/date_time.py +464 -0
  312. recurvedata/utils/dispatch.py +114 -0
  313. recurvedata/utils/email_util.py +104 -0
  314. recurvedata/utils/files.py +386 -0
  315. recurvedata/utils/helpers.py +170 -0
  316. recurvedata/utils/httputil.py +117 -0
  317. recurvedata/utils/imports.py +132 -0
  318. recurvedata/utils/json.py +80 -0
  319. recurvedata/utils/log.py +117 -0
  320. recurvedata/utils/log_capture.py +153 -0
  321. recurvedata/utils/mp.py +178 -0
  322. recurvedata/utils/normalizer.py +102 -0
  323. recurvedata/utils/redis_lock.py +474 -0
  324. recurvedata/utils/registry.py +54 -0
  325. recurvedata/utils/shell.py +15 -0
  326. recurvedata/utils/singleton.py +33 -0
  327. recurvedata/utils/sql.py +6 -0
  328. recurvedata/utils/timeout.py +28 -0
  329. recurvedata/utils/tracing.py +14 -0
  330. recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
  331. recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
  332. recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
  333. recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
@@ -0,0 +1,181 @@
1
+ import os
2
+ import threading
3
+
4
+ _default_error_rate = 1 / 10**8 # 亿分之一
5
+
6
+
7
class _BloomFilterInterface:
    """Common surface shared by every bloom filter engine in this module.

    Subclasses are expected to override ``__len__``, ``__contains__``,
    ``__str__``, ``add`` and ``save``; ``close`` is an optional hook.
    """

    def __init__(self, filename, capacity=100, error_rate=_default_error_rate, auto_scale=True, reuse=True):
        """Remember ``filename``; the remaining arguments are accepted but not used here."""
        self.filename = filename

    def __len__(self):
        """Number of stored items; must be provided by the subclass."""
        raise NotImplementedError

    def __contains__(self, item):
        """Membership test; must be provided by the subclass."""
        raise NotImplementedError

    def __str__(self):
        """Human-readable description; must be provided by the subclass."""
        raise NotImplementedError

    def add(self, key):
        """Insert ``key``; must be provided by the subclass."""
        raise NotImplementedError

    def save(self):
        """Persist the filter; must be provided by the subclass."""
        raise NotImplementedError

    def close(self):
        """Release resources. The base implementation does nothing."""
        return None
28
+
29
+
30
class PyBloomFilter(_BloomFilterInterface):
    """Lock-guarded wrapper around a ``pybloom_live`` filter stored in ``filename``.

    Attributes that are not defined on this class (``__str__`` relies on
    ``capacity`` and ``count``) are forwarded to the wrapped filter through
    ``__getattr__``.
    """

    def __init__(self, filename, capacity=100, error_rate=_default_error_rate, auto_scale=True, reuse=True):
        """Create the wrapped filter, or load it from ``filename``.

        :param filename: path used by ``save()`` and, when ``reuse`` is true, for loading
        :param capacity: capacity passed to the ``pybloom_live`` constructor
        :param error_rate: error rate passed to the ``pybloom_live`` constructor
        :param auto_scale: use ``ScalableBloomFilter`` when true, ``BloomFilter`` otherwise
        :param reuse: load the filter from ``filename`` when that file already exists
        """
        import pybloom_live

        super().__init__(filename, capacity, error_rate, auto_scale, reuse)

        # Create the lock first so the instance never holds a filter without
        # the lock that every accessor below expects.
        self._lock = threading.RLock()

        if auto_scale:
            filter_cls = pybloom_live.ScalableBloomFilter
        else:
            filter_cls = pybloom_live.BloomFilter

        if reuse and os.path.exists(filename):
            with open(filename, "rb") as f:
                self._bf = filter_cls.fromfile(f)
        else:
            self._bf = filter_cls(capacity, error_rate)

    def __getattr__(self, name):
        """Forward unknown attribute lookups to the wrapped filter.

        :raises AttributeError: if the wrapper's own ``_bf``/``_lock`` state is missing
        """
        # __getattr__ only runs when normal lookup fails. If the wrapper's own
        # state is absent (instance created without __init__, e.g. by copy),
        # reading self._lock here would re-enter __getattr__ until the
        # interpreter raises RecursionError.
        if name in ("_bf", "_lock"):
            raise AttributeError(name)
        with self._lock:
            return getattr(self._bf, name)

    def __len__(self):
        """Return ``len()`` of the wrapped filter."""
        with self._lock:
            return len(self._bf)

    def __str__(self):
        return f"<{self.__class__.__name__} ({repr(self.filename)}, capacity={self.capacity}, count={self.count})>"

    def __contains__(self, item):
        """Return whether the wrapped filter reports ``item`` as present."""
        with self._lock:
            return item in self._bf

    def add(self, key):
        """Add ``key`` to the wrapped filter and return whatever its ``add`` returns."""
        with self._lock:
            return self._bf.add(key)

    def save(self):
        """Write the wrapped filter to ``self.filename``, overwriting the file."""
        with self._lock:
            with open(self.filename, "wb") as f:
                self._bf.tofile(f)

    def close(self):
        """Persist the filter; there is no other resource to release."""
        self.save()
75
+
76
+
77
class CBloomFilter(_BloomFilterInterface):
    """Lock-guarded wrapper around ``pybloomfilter.BloomFilter`` backed by ``filename``."""

    def __init__(self, filename, capacity, error_rate=_default_error_rate, auto_scale=True, reuse=True):
        """Open the filter stored at ``filename`` or create a new one there.

        pybloomfilter does not support automatic scaling, so ``capacity`` is
        mandatory for this engine.
        """
        import pybloomfilter

        super().__init__(filename, capacity, error_rate, auto_scale, reuse)

        if reuse and os.path.exists(filename):
            bloom = pybloomfilter.BloomFilter.open(filename)
        else:
            bloom = pybloomfilter.BloomFilter(capacity, error_rate, filename)
        self._bf = bloom
        self._lock = threading.RLock()

    @property
    def count(self):
        """Same value as ``len(self)``."""
        return len(self)

    @property
    def capacity(self):
        """Capacity reported by the wrapped filter."""
        return self._bf.capacity

    def __contains__(self, item):
        """Return whether the wrapped filter reports ``item`` as present."""
        with self._lock:
            return item in self._bf

    def __len__(self):
        """Return ``len()`` of the wrapped filter."""
        with self._lock:
            return len(self._bf)

    def __str__(self):
        return f"<{self.__class__.__name__} ({repr(self.filename)}, capacity={self.capacity}, count={self.count})>"

    def add(self, key):
        """Add ``key`` to the wrapped filter and return whatever its ``add`` returns."""
        with self._lock:
            return self._bf.add(key)

    def save(self):
        """Call ``sync()`` on the wrapped filter."""
        with self._lock:
            self._bf.sync()

    def close(self):
        """Call ``close()`` on the wrapped filter."""
        self._bf.close()
119
+
120
+
121
class DummyBloomFilter(_BloomFilterInterface):
    """No-op engine: always empty, never contains anything, stores nothing."""

    def __len__(self):
        """Always zero."""
        return 0

    def __contains__(self, item):
        """Nothing is ever reported as present."""
        return False

    def __str__(self):
        """Just the class name."""
        return type(self).__name__

    def add(self, key):
        """Discard ``key`` and return ``False``."""
        return False

    def save(self):
        """Nothing to persist."""
        return None
136
+
137
+
138
# Maps the ``engine`` argument of new_bloom_filter() to the implementing class.
# Both "dummy" and "None" select the no-op filter.
_engine_choices = dict(
    [
        ("C", CBloomFilter),
        ("py", PyBloomFilter),
        ("dummy", DummyBloomFilter),
        ("None", DummyBloomFilter),
    ]
)
144
+
145
+
146
def new_bloom_filter(filename, capacity, error_rate=_default_error_rate, auto_scale=True, reuse=True, engine="py"):
    """Build a bloom filter using the implementation selected by ``engine``.

    ``engine=None`` selects the "None" entry of ``_engine_choices``; any other
    falsy value (such as an empty string) falls back to "py". A name that is
    not a key of ``_engine_choices`` raises ``KeyError``.
    """
    if engine is None:
        engine_name = "None"
    else:
        engine_name = engine or "py"
    filter_cls = _engine_choices[engine_name]
    return filter_cls(filename, capacity, error_rate, auto_scale, reuse)
151
+
152
+
153
def __benchmark(filename, capacity, error_rate, engine):
    """Insert ``range(capacity)`` into a fresh filter and print the throughput.

    The timed section covers the membership checks, the inserts and the final
    ``save()``; creating the filter is not timed.
    """
    import datetime

    values = range(capacity)
    f = new_bloom_filter(filename, capacity, error_rate, reuse=False, engine=engine)

    started_at = datetime.datetime.now()
    for value in values:
        if value not in f:
            f.add(value)
    f.save()
    duration = datetime.datetime.now() - started_at

    elapsed_seconds = duration.total_seconds()
    qps = len(values) / elapsed_seconds
    print(engine, f, duration, qps)
166
+
167
+
168
if __name__ == "__main__":
    # Results recorded from an earlier run (the recorded capacity is 2000000,
    # which differs from the capacity passed below):
    # C <CBloomFilter ('/tmp/bloom_C', capacity=2000000, count=1999740)> 0:00:03.047118 656357.9093425327
    # C <CBloomFilter ('/tmp/bloom_C', capacity=2000000, count=1999766)> 0:00:02.591953 771618.9298185576
    # C <CBloomFilter ('/tmp/bloom_C', capacity=2000000, count=1999737)> 0:00:02.756158 725647.8039357685
    # py <PyBloomFilter ('/tmp/bloom_py', capacity=2000000, count=1999770)> 0:00:33.104276 60415.15603603595
    # py <PyBloomFilter ('/tmp/bloom_py', capacity=2000000, count=1999770)> 0:00:33.406134 59869.244372904686
    # py <PyBloomFilter ('/tmp/bloom_py', capacity=2000000, count=1999770)> 0:00:34.070549 58701.725058789045
    # None DummyBloomFilter 0:00:00.431990 4629736.799462951
    # None DummyBloomFilter 0:00:00.432359 4625785.51620297
    # None DummyBloomFilter 0:00:00.432572 4623507.762869534
    for engine_name in ("C", "py", "None"):
        target_path = f"/tmp/bloom_{engine_name}"
        # Three rounds per engine, each starting from a fresh filter.
        for _round in range(3):
            __benchmark(target_path, capacity=10000000, error_rate=0.001, engine=engine_name)
@@ -0,0 +1,323 @@
1
+ import datetime
2
+ from typing import List, Union
3
+
4
+ import dateutil.parser
5
+ import pendulum
6
+
7
# Timezone objects resolved once at import time and shared by the helpers below.
_tz_utc = pendulum.timezone("utc")
_tz_local = pendulum.local_timezone()

# Anything the helpers below accept as a "date-like" value.
_DATELIKE = Union[str, datetime.datetime, datetime.date, pendulum.DateTime, pendulum.Date]
# A timezone given either as a tzinfo object or by name.
_TZ_TYPE = Union[datetime.tzinfo, str]
12
+
13
+
14
def utcnow() -> datetime.datetime:
    """Current datetime in UTC timezone, naive format (without timezone info).
    e.g. datetime.datetime(2022, 10, 8, 9, 52, 13, 489857)
    """
    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
    # "now" and drop the tzinfo to keep returning a naive value.
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
19
+
20
+
21
def utcnow_aware() -> datetime.datetime:
    """Current datetime in UTC timezone, aware format (with timezone info).

    The returned value carries the module-level ``_tz_utc`` object as its tzinfo.
    """
    # datetime.utcnow() is deprecated since Python 3.12. Read the clock as an
    # aware UTC value, then attach the module's own UTC tzinfo so the result
    # keeps the same tzinfo object as before.
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=_tz_utc)
26
+
27
+
28
def now() -> datetime.datetime:
    """Return the current local wall-clock time as a naive datetime (no tzinfo).

    e.g. datetime.datetime(2022, 10, 8, 17, 52, 13, 489857)
    """
    return datetime.datetime.today()
33
+
34
+
35
def now_aware() -> datetime.datetime:
    """Return the current time in the local timezone as an aware datetime.

    The tzinfo attached is the module-level ``_tz_local`` object.
    """
    return datetime.datetime.now(_tz_local)
40
+
41
+
42
def _ensure_datetime(dttm: _DATELIKE) -> datetime.datetime:
    """Turn a date-like value into a ``datetime.datetime``, keeping its timezone info as-is.

    >>> _ensure_datetime('2022-09-10')
    datetime.datetime(2022, 9, 10, 0, 0)
    >>> _ensure_datetime('2022-09-10 08:00:00+00:00')
    datetime.datetime(2022, 9, 10, 8, 0, tzinfo=tzutc())
    >>> _ensure_datetime(datetime.datetime(2022, 9, 10))
    datetime.datetime(2022, 9, 10, 0, 0)
    >>> _ensure_datetime(pendulum.parse('2022-09-10 08:00:00+00:00'))
    datetime.datetime(2022, 9, 10, 8, 0, tzinfo=Timezone('+00:00'))

    Raises ``TypeError`` for any other input type.
    """
    # The checks run from most to least specific: pendulum.DateTime before
    # datetime.datetime, and datetime.datetime before datetime.date.
    if isinstance(dttm, pendulum.DateTime):
        converted = datetime.datetime.fromtimestamp(dttm.timestamp(), tz=dttm.tz)
    elif isinstance(dttm, datetime.datetime):
        converted = dttm
    elif isinstance(dttm, datetime.date):
        # A plain date becomes midnight of that day.
        converted = datetime.datetime.combine(dttm, datetime.time.min)
    elif isinstance(dttm, str):
        converted = dateutil.parser.parse(dttm)
    else:
        raise TypeError(f"unsupported type {type(dttm)}")
    return converted
63
+
64
+
65
def to_pendulum(dttm: _DATELIKE) -> pendulum.DateTime:
    """Convert a date-like value into pendulum.DateTime

    Strings are parsed with ``pendulum.parse``. A plain date (not a datetime)
    is promoted to midnight of that day before conversion.

    >>> to_pendulum('2022-09-10')
    DateTime(2022, 9, 10, 0, 0, 0, tzinfo=Timezone('UTC'))
    >>> to_pendulum('2022-09-10 12:12:12')
    DateTime(2022, 9, 10, 12, 12, 12, tzinfo=Timezone('UTC'))
    >>> to_pendulum('2022-09-10 12:12:12+08:00')
    DateTime(2022, 9, 10, 12, 12, 12, tzinfo=Timezone('+08:00'))
    >>> to_pendulum(datetime.datetime(2022, 9, 10))
    DateTime(2022, 9, 10, 0, 0, 0, tzinfo=Timezone('UTC'))
    """
    if isinstance(dttm, str):
        return pendulum.parse(dttm)
    if isinstance(dttm, datetime.date) and not isinstance(dttm, datetime.datetime):
        # _DATELIKE allows plain dates, but pendulum.instance() does not turn
        # a date into a DateTime; hand it a datetime at midnight instead.
        dttm = datetime.datetime.combine(dttm, datetime.time.min)
    return pendulum.instance(dttm)
80
+
81
+
82
def as_local_datetime(dt: _DATELIKE) -> datetime.datetime:
    """Stamp a date-like value with the local timezone, discarding any original timezone.

    The wall-clock fields are kept; only the tzinfo is replaced.

    Note: these examples only hold in timezone Asia/Shanghai
    >>> as_local_datetime('2022-09-10')
    datetime.datetime(2022, 9, 10, 0, 0, tzinfo=Timezone('Asia/Shanghai'))
    >>> as_local_datetime('2022-09-10 12:12:12+08:00')
    datetime.datetime(2022, 9, 10, 12, 12, 12, tzinfo=Timezone('Asia/Shanghai'))
    >>> as_local_datetime(pendulum.parse('2022-09-10 08:00:00+00:00'))
    datetime.datetime(2022, 9, 10, 8, 0, tzinfo=Timezone('Asia/Shanghai'))
    """
    plain = _ensure_datetime(dt)
    return plain.replace(tzinfo=_tz_local)
94
+
95
+
96
def _ensure_tz(tz_or_name: _TZ_TYPE) -> datetime.tzinfo:
    """Resolve a timezone name via ``pendulum.timezone``; pass any non-string value through."""
    return pendulum.timezone(tz_or_name) if isinstance(tz_or_name, str) else tz_or_name
100
+
101
+
102
def convert_tz(dt: _DATELIKE, source: _TZ_TYPE, to: _TZ_TYPE) -> datetime.datetime:
    """Interpret ``dt`` as wall-clock time in ``source`` and convert it to ``to``.

    Any timezone already attached to ``dt`` is overwritten with ``source``.

    >>> convert_tz('2022-09-10 08:00:00', 'Asia/Shanghai', 'UTC')
    datetime.datetime(2022, 9, 10, 0, 0, tzinfo=Timezone('UTC'))
    >>> convert_tz('2022-09-10 00:00:00', 'UTC', 'Asia/Shanghai')
    datetime.datetime(2022, 9, 10, 8, 0, tzinfo=Timezone('Asia/Shanghai'))
    >>> convert_tz('2022-09-10 00:00:00', 'Europe/Paris', 'Asia/Shanghai')
    datetime.datetime(2022, 9, 10, 6, 0, tzinfo=Timezone('Asia/Shanghai'))
    """
    wall_clock = _ensure_datetime(dt)
    in_source = wall_clock.replace(tzinfo=_ensure_tz(source))
    return in_source.astimezone(_ensure_tz(to))
113
+
114
+
115
def local_to_utc(dt: _DATELIKE) -> datetime.datetime:
    """Treat ``dt`` as local wall-clock time and return the equivalent UTC datetime.

    >>> local_to_utc('2022-09-10 08:00:00')
    datetime.datetime(2022, 9, 10, 0, 0, tzinfo=Timezone('UTC'))
    >>> local_to_utc('2022-09-10 08:00:00+08:00')
    datetime.datetime(2022, 9, 10, 0, 0, tzinfo=Timezone('UTC'))
    """
    return convert_tz(dt, _tz_local, _tz_utc)
124
+
125
+
126
def utc_to_local(dt: _DATELIKE) -> datetime.datetime:
    """Treat ``dt`` as UTC wall-clock time and return the equivalent local datetime.

    >>> utc_to_local('2022-09-10 08:00:00')
    datetime.datetime(2022, 9, 10, 16, 0, tzinfo=Timezone('Asia/Shanghai'))
    >>> utc_to_local('2022-09-10 08:00:00+00:00')
    datetime.datetime(2022, 9, 10, 16, 0, tzinfo=Timezone('Asia/Shanghai'))
    """
    return convert_tz(dt, _tz_utc, _tz_local)
135
+
136
+
137
def truncate_second(dttm: _DATELIKE) -> datetime.datetime:
    """Shortcut for ``truncate(dttm, "second")``."""
    return truncate(dttm, truncate_to="second")
139
+
140
+
141
def truncate_minute(dttm: _DATELIKE) -> datetime.datetime:
    """Shortcut for ``truncate(dttm, "minute")``."""
    return truncate(dttm, truncate_to="minute")
143
+
144
+
145
def truncate_hour(dttm: _DATELIKE) -> datetime.datetime:
    """Shortcut for ``truncate(dttm, "hour")``."""
    return truncate(dttm, truncate_to="hour")
147
+
148
+
149
def truncate_day(dttm: _DATELIKE) -> datetime.datetime:
    """Shortcut for ``truncate(dttm, "day")``."""
    return truncate(dttm, truncate_to="day")
151
+
152
+
153
def truncate_week(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of the Monday of the week containing ``dttm``."""
    midnight = truncate(dttm, "day")
    # weekday() is 0 for Monday, i.e. the number of days to step back.
    days_since_monday = midnight.weekday()
    return midnight - datetime.timedelta(days=days_since_monday)
156
+
157
+
158
def truncate_month(dttm: _DATELIKE) -> datetime.datetime:
    """Shortcut for ``truncate(dttm, "month")``."""
    return truncate(dttm, truncate_to="month")
160
+
161
+
162
def truncate_half_month(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of the 1st or the 15th, whichever starts the half month containing ``dttm``.

    Days 15 and later map to the 15th; earlier days map to the 1st.
    """
    midnight = truncate_day(dttm)
    anchor_day = 15 if midnight.day >= 15 else 1
    return midnight.replace(day=anchor_day)
167
+
168
+
169
def truncate_quarter(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of the first day of the calendar quarter containing ``dttm``."""
    month_floor = truncate(dttm, "month")
    # Months 1-3 -> 1, 4-6 -> 4, 7-9 -> 7, 10-12 -> 10.
    quarter_first_month = (month_floor.month - 1) // 3 * 3 + 1
    return month_floor.replace(month=quarter_first_month)
181
+
182
+
183
def truncate_half_year(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of January 1st or July 1st, whichever starts the half year containing ``dttm``."""
    month_floor = truncate(dttm, "month")
    first_month = 1 if 1 <= month_floor.month <= 6 else 7
    return month_floor.replace(month=first_month)
188
+
189
+
190
def truncate_year(dttm: _DATELIKE) -> datetime.datetime:
    """Shortcut for ``truncate(dttm, "year")``."""
    return truncate(dttm, truncate_to="year")
192
+
193
+
194
# Periods that truncate() handles with a single datetime.replace() call: each
# entry lists the fields to reset and the values to reset them to.
_PERIODS = {
    "second": dict(microsecond=0),
    "minute": dict(microsecond=0, second=0),
    "hour": dict(microsecond=0, second=0, minute=0),
    "day": dict(
        microsecond=0,
        second=0,
        minute=0,
        hour=0,
    ),
    "month": dict(microsecond=0, second=0, minute=0, hour=0, day=1),
    "year": dict(microsecond=0, second=0, minute=0, hour=0, day=1, month=1),
}
# Periods that need their own function. "half_month" is registered so that
# truncate(x, "half_month") reaches truncate_half_month like the other
# dedicated helpers instead of raising ValueError.
_ODD_PERIODS = {
    "week": truncate_week,
    "half_month": truncate_half_month,
    "quarter": truncate_quarter,
    "half_year": truncate_half_year,
}
208
+
209
+
210
def truncate(dttm: _DATELIKE, truncate_to="day") -> datetime.datetime:
    """Round ``dttm`` down to the start of the period named by ``truncate_to``.

    Names found in ``_PERIODS`` are handled by resetting the listed fields;
    names found in ``_ODD_PERIODS`` are delegated to their dedicated function.
    Any other name raises ``ValueError`` listing the valid periods.
    """
    dttm = _ensure_datetime(dttm)

    reset_fields = _PERIODS.get(truncate_to)
    if reset_fields is not None:
        return dttm.replace(**reset_fields)

    handler = _ODD_PERIODS.get(truncate_to)
    if handler is None:
        valid_names = [*_PERIODS, *_ODD_PERIODS]
        raise ValueError("truncate_to not valid. Valid periods: {}".format(", ".join(valid_names)))
    return handler(dttm)
222
+
223
+
224
def date_add(dttm: _DATELIKE, days: int) -> datetime.datetime:
    """Return ``dttm`` shifted by ``days`` days.

    >>> dttm = datetime.datetime(2022, 10, 8)
    >>> date_add(dttm, 6)
    datetime.datetime(2022, 10, 14, 0, 0)
    >>> date_add('2022-10-08', 6)
    datetime.datetime(2022, 10, 14, 0, 0)
    """
    base = _ensure_datetime(dttm)
    return base + datetime.timedelta(days=days)
233
+
234
+
235
def month_start(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of the first day of the month containing ``dttm``.

    >>> dttm = datetime.datetime(2022, 10, 8)
    >>> month_start(dttm)
    datetime.datetime(2022, 10, 1, 0, 0)
    """
    first_day = truncate_month(dttm)
    return first_day
242
+
243
+
244
def month_end(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of the last day of the month containing ``dttm``.

    The result is rebuilt from year/month/day only, so it is always naive.

    >>> dttm = datetime.datetime(2022, 10, 8)
    >>> month_end(dttm)
    datetime.datetime(2022, 10, 31, 0, 0)
    """
    last_day = to_pendulum(dttm).last_of("month")
    return datetime.datetime(year=last_day.year, month=last_day.month, day=last_day.day)
252
+
253
+
254
def _get_last_month(dttm: _DATELIKE) -> datetime.datetime:
    """Return a datetime inside the previous month: one day before this month's start."""
    one_day = datetime.timedelta(days=1)
    return month_start(dttm) - one_day
256
+
257
+
258
def last_month_start(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of the first day of the month before the one containing ``dttm``.

    >>> dttm = datetime.datetime(2022, 10, 8)
    >>> last_month_start(dttm)
    datetime.datetime(2022, 9, 1, 0, 0)
    """
    somewhere_in_last_month = _get_last_month(dttm)
    return month_start(somewhere_in_last_month)
265
+
266
+
267
def last_month_end(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of the last day of the month before the one containing ``dttm``.

    >>> dttm = datetime.datetime(2022, 10, 8)
    >>> last_month_end(dttm)
    datetime.datetime(2022, 9, 30, 0, 0)
    """
    this_month_first_day = month_start(dttm)
    return this_month_first_day - datetime.timedelta(days=1)
274
+
275
+
276
def _get_last_week(dttm: _DATELIKE) -> datetime.datetime:
    """Return the Monday (at midnight) of the week before the one containing ``dttm``."""
    this_monday = truncate_week(dttm)
    return this_monday - datetime.timedelta(weeks=1)
278
+
279
+
280
def last_week_start(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of the Monday of the week before the one containing ``dttm``.

    >>> dttm = datetime.datetime(2022, 10, 8)
    >>> last_week_start(dttm)
    datetime.datetime(2022, 9, 26, 0, 0)
    """
    this_monday = truncate_week(dttm)
    return this_monday - datetime.timedelta(weeks=1)
287
+
288
+
289
def last_week_end(dttm: _DATELIKE) -> datetime.datetime:
    """Return midnight of the Sunday that closes the week before the one containing ``dttm``.

    >>> dttm = datetime.datetime(2022, 10, 8)
    >>> last_week_end(dttm)
    datetime.datetime(2022, 10, 2, 0, 0)
    """
    this_monday = truncate_week(dttm)
    return this_monday - datetime.timedelta(days=1)
296
+
297
+
298
def month_range(start_date: _DATELIKE, end_date: _DATELIKE) -> List[str]:
    """Get the first day of all months between start_date and end_date

    Both ends are inclusive; the result is a list of ISO date strings.

    >>> month_range('2022-01-02', '2022-05-20')
    ['2022-01-01', '2022-02-01', '2022-03-01', '2022-04-01', '2022-05-01']
    """
    start_date = to_pendulum(start_date).replace(day=1)
    end_date = to_pendulum(end_date).replace(day=1)
    # pendulum 3 renamed period() to interval(); use whichever this
    # installation provides so the helper works on both major versions.
    make_span = getattr(pendulum, "period", None) or pendulum.interval
    return [x.date().isoformat() for x in make_span(start_date, end_date).range("months")]
307
+
308
+
309
def day_range(start_date: _DATELIKE, end_date: _DATELIKE) -> List[str]:
    """Get all dates between start_date and end_date

    Both ends are inclusive; the result is a list of ISO date strings.

    >>> day_range('2022-01-02', '2022-01-07')
    ['2022-01-02', '2022-01-03', '2022-01-04', '2022-01-05', '2022-01-06', '2022-01-07']
    """
    start_date = to_pendulum(start_date)
    end_date = to_pendulum(end_date)
    # pendulum 3 renamed period() to interval(); use whichever this
    # installation provides so the helper works on both major versions.
    make_span = getattr(pendulum, "period", None) or pendulum.interval
    return [x.date().isoformat() for x in make_span(start_date, end_date).range("days")]
318
+
319
+
320
if __name__ == "__main__":
    # Running the module directly executes the docstring examples above.
    import doctest

    doctest.testmod()
@@ -0,0 +1,15 @@
1
# Translation table indexed by code point: every 7-bit ASCII character maps to
# itself unless it is overridden below.
_escape_table = list(map(chr, range(128)))
_escape_table[0] = "\\0"

# hive delimiters
_escape_table[1] = "\\1"
# _escape_table[2] = '\\2'
# _escape_table[3] = '\\3'

# _escape_table[ord('\\')] = '\\\\'
_escape_table[ord("\n")] = "\\n"
_escape_table[ord("\r")] = "\\r"


def escape_string(v):
    """Return ``v`` with NUL, \\x01, newline and carriage return replaced by backslash escapes.

    The replacements are the two-character sequences ``\\0``, ``\\1``, ``\\n``
    and ``\\r``; backslashes themselves are not escaped.
    """
    escaped = v.translate(_escape_table)
    return escaped