pycharter 0.0.22__py3-none-any.whl → 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (404)
  1. api/main.py +27 -1
  2. api/models/docs.py +68 -0
  3. api/models/evolution.py +117 -0
  4. api/models/tracking.py +111 -0
  5. api/models/validation.py +46 -6
  6. api/routes/v1/__init__.py +14 -1
  7. api/routes/v1/docs.py +187 -0
  8. api/routes/v1/evolution.py +337 -0
  9. api/routes/v1/templates.py +211 -27
  10. api/routes/v1/tracking.py +301 -0
  11. api/routes/v1/validation.py +68 -31
  12. pycharter/__init__.py +268 -58
  13. pycharter/data/templates/contract/template_coercion_rules.yaml +57 -0
  14. pycharter/data/templates/contract/template_contract.yaml +122 -0
  15. pycharter/data/templates/contract/template_metadata.yaml +68 -0
  16. pycharter/data/templates/contract/template_schema.yaml +100 -0
  17. pycharter/data/templates/contract/template_validation_rules.yaml +75 -0
  18. pycharter/data/templates/etl/README.md +224 -0
  19. pycharter/data/templates/etl/extract_cloud_azure.yaml +24 -0
  20. pycharter/data/templates/etl/extract_cloud_gcs.yaml +25 -0
  21. pycharter/data/templates/etl/extract_cloud_s3.yaml +30 -0
  22. pycharter/data/templates/etl/extract_database.yaml +34 -0
  23. pycharter/data/templates/etl/extract_database_ssh.yaml +40 -0
  24. pycharter/data/templates/etl/extract_file_csv.yaml +21 -0
  25. pycharter/data/templates/etl/extract_file_glob.yaml +25 -0
  26. pycharter/data/templates/etl/extract_file_json.yaml +24 -0
  27. pycharter/data/templates/etl/extract_file_parquet.yaml +20 -0
  28. pycharter/data/templates/etl/extract_http_paginated.yaml +79 -0
  29. pycharter/data/templates/etl/extract_http_path_params.yaml +38 -0
  30. pycharter/data/templates/etl/extract_http_simple.yaml +62 -0
  31. pycharter/data/templates/etl/load_cloud_azure.yaml +24 -0
  32. pycharter/data/templates/etl/load_cloud_gcs.yaml +22 -0
  33. pycharter/data/templates/etl/load_cloud_s3.yaml +27 -0
  34. pycharter/data/templates/etl/load_file.yaml +34 -0
  35. pycharter/data/templates/etl/load_insert.yaml +18 -0
  36. pycharter/data/templates/etl/load_postgresql.yaml +39 -0
  37. pycharter/data/templates/etl/load_sqlite.yaml +21 -0
  38. pycharter/data/templates/etl/load_truncate_and_load.yaml +20 -0
  39. pycharter/data/templates/etl/load_upsert.yaml +25 -0
  40. pycharter/data/templates/etl/load_with_dlq.yaml +34 -0
  41. pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +35 -0
  42. pycharter/data/templates/etl/pipeline_http_to_db.yaml +75 -0
  43. pycharter/data/templates/etl/transform_combined.yaml +48 -0
  44. pycharter/data/templates/etl/transform_custom_function.yaml +58 -0
  45. pycharter/data/templates/etl/transform_jsonata.yaml +51 -0
  46. pycharter/data/templates/etl/transform_simple.yaml +59 -0
  47. pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
  48. pycharter/docs_generator/__init__.py +43 -0
  49. pycharter/docs_generator/generator.py +465 -0
  50. pycharter/docs_generator/renderers.py +247 -0
  51. pycharter/etl_generator/__init__.py +168 -80
  52. pycharter/etl_generator/builder.py +121 -0
  53. pycharter/etl_generator/config_loader.py +394 -0
  54. pycharter/etl_generator/config_validator.py +418 -0
  55. pycharter/etl_generator/context.py +132 -0
  56. pycharter/etl_generator/expression.py +499 -0
  57. pycharter/etl_generator/extractors/__init__.py +30 -0
  58. pycharter/etl_generator/extractors/base.py +70 -0
  59. pycharter/etl_generator/extractors/cloud_storage.py +530 -0
  60. pycharter/etl_generator/extractors/database.py +221 -0
  61. pycharter/etl_generator/extractors/factory.py +185 -0
  62. pycharter/etl_generator/extractors/file.py +475 -0
  63. pycharter/etl_generator/extractors/http.py +895 -0
  64. pycharter/etl_generator/extractors/streaming.py +57 -0
  65. pycharter/etl_generator/loaders/__init__.py +41 -0
  66. pycharter/etl_generator/loaders/base.py +35 -0
  67. pycharter/etl_generator/loaders/cloud.py +87 -0
  68. pycharter/etl_generator/loaders/cloud_storage_loader.py +275 -0
  69. pycharter/etl_generator/loaders/database.py +274 -0
  70. pycharter/etl_generator/loaders/factory.py +180 -0
  71. pycharter/etl_generator/loaders/file.py +72 -0
  72. pycharter/etl_generator/loaders/file_loader.py +130 -0
  73. pycharter/etl_generator/pipeline.py +743 -0
  74. pycharter/etl_generator/protocols.py +54 -0
  75. pycharter/etl_generator/result.py +63 -0
  76. pycharter/etl_generator/schemas/__init__.py +49 -0
  77. pycharter/etl_generator/transformers/__init__.py +49 -0
  78. pycharter/etl_generator/transformers/base.py +63 -0
  79. pycharter/etl_generator/transformers/config.py +45 -0
  80. pycharter/etl_generator/transformers/custom_function.py +101 -0
  81. pycharter/etl_generator/transformers/jsonata_transformer.py +56 -0
  82. pycharter/etl_generator/transformers/operations.py +218 -0
  83. pycharter/etl_generator/transformers/pipeline.py +54 -0
  84. pycharter/etl_generator/transformers/simple_operations.py +131 -0
  85. pycharter/quality/__init__.py +25 -0
  86. pycharter/quality/tracking/__init__.py +64 -0
  87. pycharter/quality/tracking/collector.py +318 -0
  88. pycharter/quality/tracking/exporters.py +238 -0
  89. pycharter/quality/tracking/models.py +194 -0
  90. pycharter/quality/tracking/store.py +385 -0
  91. pycharter/runtime_validator/__init__.py +20 -7
  92. pycharter/runtime_validator/builder.py +328 -0
  93. pycharter/runtime_validator/validator.py +311 -7
  94. pycharter/runtime_validator/validator_core.py +61 -0
  95. pycharter/schema_evolution/__init__.py +61 -0
  96. pycharter/schema_evolution/compatibility.py +270 -0
  97. pycharter/schema_evolution/diff.py +496 -0
  98. pycharter/schema_evolution/models.py +201 -0
  99. pycharter/shared/__init__.py +56 -0
  100. pycharter/shared/errors.py +296 -0
  101. pycharter/shared/protocols.py +234 -0
  102. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/METADATA +146 -26
  103. pycharter-0.0.24.dist-info/RECORD +543 -0
  104. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/WHEEL +1 -1
  105. ui/static/404/index.html +1 -1
  106. ui/static/404.html +1 -1
  107. ui/static/__next.__PAGE__.txt +1 -1
  108. ui/static/__next._full.txt +1 -1
  109. ui/static/__next._head.txt +1 -1
  110. ui/static/__next._index.txt +1 -1
  111. ui/static/__next._tree.txt +1 -1
  112. ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
  113. ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
  114. ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
  115. ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
  116. ui/static/_not-found/__next._full.txt +1 -1
  117. ui/static/_not-found/__next._head.txt +1 -1
  118. ui/static/_not-found/__next._index.txt +1 -1
  119. ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  120. ui/static/_not-found/__next._not-found.txt +1 -1
  121. ui/static/_not-found/__next._tree.txt +1 -1
  122. ui/static/_not-found/index.html +1 -1
  123. ui/static/_not-found/index.txt +1 -1
  124. ui/static/contracts/__next._full.txt +2 -2
  125. ui/static/contracts/__next._head.txt +1 -1
  126. ui/static/contracts/__next._index.txt +1 -1
  127. ui/static/contracts/__next._tree.txt +1 -1
  128. ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  129. ui/static/contracts/__next.contracts.txt +1 -1
  130. ui/static/contracts/index.html +1 -1
  131. ui/static/contracts/index.txt +2 -2
  132. ui/static/documentation/__next._full.txt +1 -1
  133. ui/static/documentation/__next._head.txt +1 -1
  134. ui/static/documentation/__next._index.txt +1 -1
  135. ui/static/documentation/__next._tree.txt +1 -1
  136. ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
  137. ui/static/documentation/__next.documentation.txt +1 -1
  138. ui/static/documentation/index.html +2 -2
  139. ui/static/documentation/index.txt +1 -1
  140. ui/static/index.html +1 -1
  141. ui/static/index.txt +1 -1
  142. ui/static/metadata/__next._full.txt +1 -1
  143. ui/static/metadata/__next._head.txt +1 -1
  144. ui/static/metadata/__next._index.txt +1 -1
  145. ui/static/metadata/__next._tree.txt +1 -1
  146. ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  147. ui/static/metadata/__next.metadata.txt +1 -1
  148. ui/static/metadata/index.html +1 -1
  149. ui/static/metadata/index.txt +1 -1
  150. ui/static/quality/__next._full.txt +2 -2
  151. ui/static/quality/__next._head.txt +1 -1
  152. ui/static/quality/__next._index.txt +1 -1
  153. ui/static/quality/__next._tree.txt +1 -1
  154. ui/static/quality/__next.quality.__PAGE__.txt +2 -2
  155. ui/static/quality/__next.quality.txt +1 -1
  156. ui/static/quality/index.html +2 -2
  157. ui/static/quality/index.txt +2 -2
  158. ui/static/rules/__next._full.txt +1 -1
  159. ui/static/rules/__next._head.txt +1 -1
  160. ui/static/rules/__next._index.txt +1 -1
  161. ui/static/rules/__next._tree.txt +1 -1
  162. ui/static/rules/__next.rules.__PAGE__.txt +1 -1
  163. ui/static/rules/__next.rules.txt +1 -1
  164. ui/static/rules/index.html +1 -1
  165. ui/static/rules/index.txt +1 -1
  166. ui/static/schemas/__next._full.txt +1 -1
  167. ui/static/schemas/__next._head.txt +1 -1
  168. ui/static/schemas/__next._index.txt +1 -1
  169. ui/static/schemas/__next._tree.txt +1 -1
  170. ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  171. ui/static/schemas/__next.schemas.txt +1 -1
  172. ui/static/schemas/index.html +1 -1
  173. ui/static/schemas/index.txt +1 -1
  174. ui/static/settings/__next._full.txt +1 -1
  175. ui/static/settings/__next._head.txt +1 -1
  176. ui/static/settings/__next._index.txt +1 -1
  177. ui/static/settings/__next._tree.txt +1 -1
  178. ui/static/settings/__next.settings.__PAGE__.txt +1 -1
  179. ui/static/settings/__next.settings.txt +1 -1
  180. ui/static/settings/index.html +1 -1
  181. ui/static/settings/index.txt +1 -1
  182. ui/static/static/404/index.html +1 -1
  183. ui/static/static/404.html +1 -1
  184. ui/static/static/__next.__PAGE__.txt +1 -1
  185. ui/static/static/__next._full.txt +2 -2
  186. ui/static/static/__next._head.txt +1 -1
  187. ui/static/static/__next._index.txt +2 -2
  188. ui/static/static/__next._tree.txt +2 -2
  189. ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
  190. ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
  191. ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
  192. ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
  193. ui/static/static/_not-found/__next._full.txt +2 -2
  194. ui/static/static/_not-found/__next._head.txt +1 -1
  195. ui/static/static/_not-found/__next._index.txt +2 -2
  196. ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  197. ui/static/static/_not-found/__next._not-found.txt +1 -1
  198. ui/static/static/_not-found/__next._tree.txt +2 -2
  199. ui/static/static/_not-found/index.html +1 -1
  200. ui/static/static/_not-found/index.txt +2 -2
  201. ui/static/static/contracts/__next._full.txt +3 -3
  202. ui/static/static/contracts/__next._head.txt +1 -1
  203. ui/static/static/contracts/__next._index.txt +2 -2
  204. ui/static/static/contracts/__next._tree.txt +2 -2
  205. ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  206. ui/static/static/contracts/__next.contracts.txt +1 -1
  207. ui/static/static/contracts/index.html +1 -1
  208. ui/static/static/contracts/index.txt +3 -3
  209. ui/static/static/documentation/__next._full.txt +3 -3
  210. ui/static/static/documentation/__next._head.txt +1 -1
  211. ui/static/static/documentation/__next._index.txt +2 -2
  212. ui/static/static/documentation/__next._tree.txt +2 -2
  213. ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
  214. ui/static/static/documentation/__next.documentation.txt +1 -1
  215. ui/static/static/documentation/index.html +2 -2
  216. ui/static/static/documentation/index.txt +3 -3
  217. ui/static/static/index.html +1 -1
  218. ui/static/static/index.txt +2 -2
  219. ui/static/static/metadata/__next._full.txt +2 -2
  220. ui/static/static/metadata/__next._head.txt +1 -1
  221. ui/static/static/metadata/__next._index.txt +2 -2
  222. ui/static/static/metadata/__next._tree.txt +2 -2
  223. ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  224. ui/static/static/metadata/__next.metadata.txt +1 -1
  225. ui/static/static/metadata/index.html +1 -1
  226. ui/static/static/metadata/index.txt +2 -2
  227. ui/static/static/quality/__next._full.txt +2 -2
  228. ui/static/static/quality/__next._head.txt +1 -1
  229. ui/static/static/quality/__next._index.txt +2 -2
  230. ui/static/static/quality/__next._tree.txt +2 -2
  231. ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
  232. ui/static/static/quality/__next.quality.txt +1 -1
  233. ui/static/static/quality/index.html +2 -2
  234. ui/static/static/quality/index.txt +2 -2
  235. ui/static/static/rules/__next._full.txt +2 -2
  236. ui/static/static/rules/__next._head.txt +1 -1
  237. ui/static/static/rules/__next._index.txt +2 -2
  238. ui/static/static/rules/__next._tree.txt +2 -2
  239. ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
  240. ui/static/static/rules/__next.rules.txt +1 -1
  241. ui/static/static/rules/index.html +1 -1
  242. ui/static/static/rules/index.txt +2 -2
  243. ui/static/static/schemas/__next._full.txt +2 -2
  244. ui/static/static/schemas/__next._head.txt +1 -1
  245. ui/static/static/schemas/__next._index.txt +2 -2
  246. ui/static/static/schemas/__next._tree.txt +2 -2
  247. ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  248. ui/static/static/schemas/__next.schemas.txt +1 -1
  249. ui/static/static/schemas/index.html +1 -1
  250. ui/static/static/schemas/index.txt +2 -2
  251. ui/static/static/settings/__next._full.txt +2 -2
  252. ui/static/static/settings/__next._head.txt +1 -1
  253. ui/static/static/settings/__next._index.txt +2 -2
  254. ui/static/static/settings/__next._tree.txt +2 -2
  255. ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
  256. ui/static/static/settings/__next.settings.txt +1 -1
  257. ui/static/static/settings/index.html +1 -1
  258. ui/static/static/settings/index.txt +2 -2
  259. ui/static/static/static/.gitkeep +0 -0
  260. ui/static/static/static/404/index.html +1 -0
  261. ui/static/static/static/404.html +1 -0
  262. ui/static/static/static/__next.__PAGE__.txt +10 -0
  263. ui/static/static/static/__next._full.txt +30 -0
  264. ui/static/static/static/__next._head.txt +7 -0
  265. ui/static/static/static/__next._index.txt +9 -0
  266. ui/static/static/static/__next._tree.txt +2 -0
  267. ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
  268. ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
  269. ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
  270. ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
  271. ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
  272. ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
  273. ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
  274. ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
  275. ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
  276. ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
  277. ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
  278. ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
  279. ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
  280. ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
  281. ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
  282. ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
  283. ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
  284. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
  285. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
  286. ui/static/static/static/_not-found/__next._full.txt +17 -0
  287. ui/static/static/static/_not-found/__next._head.txt +7 -0
  288. ui/static/static/static/_not-found/__next._index.txt +9 -0
  289. ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
  290. ui/static/static/static/_not-found/__next._not-found.txt +4 -0
  291. ui/static/static/static/_not-found/__next._tree.txt +2 -0
  292. ui/static/static/static/_not-found/index.html +1 -0
  293. ui/static/static/static/_not-found/index.txt +17 -0
  294. ui/static/static/static/contracts/__next._full.txt +21 -0
  295. ui/static/static/static/contracts/__next._head.txt +7 -0
  296. ui/static/static/static/contracts/__next._index.txt +9 -0
  297. ui/static/static/static/contracts/__next._tree.txt +2 -0
  298. ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
  299. ui/static/static/static/contracts/__next.contracts.txt +4 -0
  300. ui/static/static/static/contracts/index.html +1 -0
  301. ui/static/static/static/contracts/index.txt +21 -0
  302. ui/static/static/static/documentation/__next._full.txt +21 -0
  303. ui/static/static/static/documentation/__next._head.txt +7 -0
  304. ui/static/static/static/documentation/__next._index.txt +9 -0
  305. ui/static/static/static/documentation/__next._tree.txt +2 -0
  306. ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
  307. ui/static/static/static/documentation/__next.documentation.txt +4 -0
  308. ui/static/static/static/documentation/index.html +93 -0
  309. ui/static/static/static/documentation/index.txt +21 -0
  310. ui/static/static/static/index.html +1 -0
  311. ui/static/static/static/index.txt +30 -0
  312. ui/static/static/static/metadata/__next._full.txt +21 -0
  313. ui/static/static/static/metadata/__next._head.txt +7 -0
  314. ui/static/static/static/metadata/__next._index.txt +9 -0
  315. ui/static/static/static/metadata/__next._tree.txt +2 -0
  316. ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
  317. ui/static/static/static/metadata/__next.metadata.txt +4 -0
  318. ui/static/static/static/metadata/index.html +1 -0
  319. ui/static/static/static/metadata/index.txt +21 -0
  320. ui/static/static/static/quality/__next._full.txt +21 -0
  321. ui/static/static/static/quality/__next._head.txt +7 -0
  322. ui/static/static/static/quality/__next._index.txt +9 -0
  323. ui/static/static/static/quality/__next._tree.txt +2 -0
  324. ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
  325. ui/static/static/static/quality/__next.quality.txt +4 -0
  326. ui/static/static/static/quality/index.html +2 -0
  327. ui/static/static/static/quality/index.txt +21 -0
  328. ui/static/static/static/rules/__next._full.txt +21 -0
  329. ui/static/static/static/rules/__next._head.txt +7 -0
  330. ui/static/static/static/rules/__next._index.txt +9 -0
  331. ui/static/static/static/rules/__next._tree.txt +2 -0
  332. ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
  333. ui/static/static/static/rules/__next.rules.txt +4 -0
  334. ui/static/static/static/rules/index.html +1 -0
  335. ui/static/static/static/rules/index.txt +21 -0
  336. ui/static/static/static/schemas/__next._full.txt +21 -0
  337. ui/static/static/static/schemas/__next._head.txt +7 -0
  338. ui/static/static/static/schemas/__next._index.txt +9 -0
  339. ui/static/static/static/schemas/__next._tree.txt +2 -0
  340. ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
  341. ui/static/static/static/schemas/__next.schemas.txt +4 -0
  342. ui/static/static/static/schemas/index.html +1 -0
  343. ui/static/static/static/schemas/index.txt +21 -0
  344. ui/static/static/static/settings/__next._full.txt +21 -0
  345. ui/static/static/static/settings/__next._head.txt +7 -0
  346. ui/static/static/static/settings/__next._index.txt +9 -0
  347. ui/static/static/static/settings/__next._tree.txt +2 -0
  348. ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
  349. ui/static/static/static/settings/__next.settings.txt +4 -0
  350. ui/static/static/static/settings/index.html +1 -0
  351. ui/static/static/static/settings/index.txt +21 -0
  352. ui/static/static/static/validation/__next._full.txt +21 -0
  353. ui/static/static/static/validation/__next._head.txt +7 -0
  354. ui/static/static/static/validation/__next._index.txt +9 -0
  355. ui/static/static/static/validation/__next._tree.txt +2 -0
  356. ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
  357. ui/static/static/static/validation/__next.validation.txt +4 -0
  358. ui/static/static/static/validation/index.html +1 -0
  359. ui/static/static/static/validation/index.txt +21 -0
  360. ui/static/static/validation/__next._full.txt +2 -2
  361. ui/static/static/validation/__next._head.txt +1 -1
  362. ui/static/static/validation/__next._index.txt +2 -2
  363. ui/static/static/validation/__next._tree.txt +2 -2
  364. ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
  365. ui/static/static/validation/__next.validation.txt +1 -1
  366. ui/static/static/validation/index.html +1 -1
  367. ui/static/static/validation/index.txt +2 -2
  368. ui/static/validation/__next._full.txt +2 -2
  369. ui/static/validation/__next._head.txt +1 -1
  370. ui/static/validation/__next._index.txt +1 -1
  371. ui/static/validation/__next._tree.txt +1 -1
  372. ui/static/validation/__next.validation.__PAGE__.txt +2 -2
  373. ui/static/validation/__next.validation.txt +1 -1
  374. ui/static/validation/index.html +1 -1
  375. ui/static/validation/index.txt +2 -2
  376. pycharter/data/templates/template_coercion_rules.yaml +0 -15
  377. pycharter/data/templates/template_contract.yaml +0 -587
  378. pycharter/data/templates/template_metadata.yaml +0 -38
  379. pycharter/data/templates/template_schema.yaml +0 -22
  380. pycharter/data/templates/template_transform_advanced.yaml +0 -50
  381. pycharter/data/templates/template_transform_simple.yaml +0 -59
  382. pycharter/data/templates/template_validation_rules.yaml +0 -29
  383. pycharter/etl_generator/extraction.py +0 -916
  384. pycharter/etl_generator/factory.py +0 -174
  385. pycharter/etl_generator/orchestrator.py +0 -1650
  386. pycharter/integrations/__init__.py +0 -19
  387. pycharter/integrations/kafka.py +0 -178
  388. pycharter/integrations/streaming.py +0 -100
  389. pycharter-0.0.22.dist-info/RECORD +0 -358
  390. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/entry_points.txt +0 -0
  391. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/licenses/LICENSE +0 -0
  392. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/top_level.txt +0 -0
  393. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
  394. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
  395. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
  396. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
  397. /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
  398. /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
  399. /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
  400. /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
  401. /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
  402. /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
  403. /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
  404. /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
pycharter/etl_generator/orchestrator.py
@@ -1,1650 +0,0 @@
1
- """
2
- ETL Orchestrator - Streaming ETL pipeline with simple operations, JSONata, and custom functions.
3
-
4
- Executes ETL pipelines: Extract → Transform (Simple Operations → JSONata → Custom Functions) → Load.
5
-
6
- Transformation Pipeline:
7
- 1. Simple Operations: rename, convert, defaults, add, select, drop (declarative, easy to use)
8
- 2. JSONata: Powerful query language for complex transformations (full JSONata support)
9
- 3. Custom Functions: Import and run external Python modules/functions
10
- """
11
-
12
- import asyncio
13
- import gc
14
- import importlib
15
- import logging
16
- import re
17
- import uuid
18
- import warnings
19
- from collections import Counter, defaultdict
20
- from datetime import datetime
21
- from pathlib import Path
22
- from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Tuple
23
-
24
- import jsonata
25
- import yaml
26
-
27
- from pycharter.contract_parser import ContractMetadata, parse_contract_file
28
- from pycharter.etl_generator.checkpoint import CheckpointManager
29
- from pycharter.etl_generator.database import get_database_connection, load_data
30
- from pycharter.etl_generator.dlq import DeadLetterQueue, DLQReason
31
- from pycharter.etl_generator.extraction import extract_with_pagination_streaming
32
- from pycharter.etl_generator.progress import ETLProgress, ProgressTracker
33
- from pycharter.utils.value_injector import resolve_values
34
-
35
- logger = logging.getLogger(__name__)
36
-
37
- # Optional memory monitoring
38
- try:
39
- import psutil
40
- PSUTIL_AVAILABLE = True
41
- except ImportError:
42
- PSUTIL_AVAILABLE = False
43
-
44
- DEFAULT_BATCH_SIZE = 1000
45
-
46
-
47
- class ETLOrchestrator:
48
- """
49
- Generic ETL Orchestrator that executes pipelines from contract artifacts and ETL configs.
50
-
51
- Processes data in streaming mode: Extract-Batch → Transform-Batch → Load-Batch.
52
- This ensures constant memory usage regardless of dataset size.
53
-
54
- Example:
55
- >>> from pycharter.etl_generator import ETLOrchestrator
56
- >>> orchestrator = ETLOrchestrator(contract_dir="data/examples/my_contract")
57
- >>> await orchestrator.run()
58
- """
59
-
60
- def __init__(
61
- self,
62
- contract_dir: Optional[str] = None,
63
- contract_file: Optional[str] = None,
64
- contract_dict: Optional[Dict[str, Any]] = None,
65
- contract_metadata: Optional[ContractMetadata] = None,
66
- checkpoint_dir: Optional[str] = None,
67
- progress_callback: Optional[Callable[[ETLProgress], None]] = None,
68
- verbose: bool = True,
69
- max_memory_mb: Optional[int] = None,
70
- config_context: Optional[Dict[str, Any]] = None,
71
- # ETL config options (alternative to loading from contract_dir)
72
- extract_config: Optional[Dict[str, Any]] = None,
73
- transform_config: Optional[Dict[str, Any]] = None,
74
- load_config: Optional[Dict[str, Any]] = None,
75
- extract_file: Optional[str] = None,
76
- transform_file: Optional[str] = None,
77
- load_file: Optional[str] = None,
78
- ):
79
- """
80
- Initialize the ETL orchestrator with contract artifacts.
81
-
82
- Args:
83
- contract_dir: Directory containing contract files and ETL configs
84
- contract_file: Path to complete contract file (YAML/JSON)
85
- contract_dict: Contract as dictionary
86
- contract_metadata: ContractMetadata object (from parse_contract)
87
- checkpoint_dir: Directory for checkpoint files (None = disabled)
88
- progress_callback: Optional callback for progress updates
89
- verbose: If True, print progress to stdout
90
- max_memory_mb: Maximum memory usage in MB (None = no limit)
91
- config_context: Optional context dictionary for value injection.
92
- Values in this dict have highest priority when resolving
93
- variables in config files (e.g., ${VAR}).
94
- Useful for injecting application-level settings.
95
- extract_config: Optional extract configuration as dictionary.
96
- If provided, overrides extract.yaml from contract_dir.
97
- transform_config: Optional transform configuration as dictionary.
98
- If provided, overrides transform.yaml from contract_dir.
99
- load_config: Optional load configuration as dictionary.
100
- If provided, overrides load.yaml from contract_dir.
101
- extract_file: Optional path to extract.yaml file.
102
- If provided, overrides extract.yaml from contract_dir.
103
- transform_file: Optional path to transform.yaml file.
104
- If provided, overrides transform.yaml from contract_dir.
105
- load_file: Optional path to load.yaml file.
106
- If provided, overrides load.yaml from contract_dir.
107
-
108
- Note:
109
- ETL config priority: direct dict > file path > contract_dir
110
- If contract_dir is not provided, you must provide extract_config/transform_config/load_config
111
- or extract_file/transform_file/load_file.
112
-
113
- Note:
114
- Tables must be created manually or via migrations (e.g., Alembic).
115
- PyCharter no longer creates tables from schema.json.
116
- """
117
- self.contract_dir: Optional[Path] = None
118
- self.schema: Optional[Dict[str, Any]] = None
119
- self.coercion_rules: Dict[str, Any] = {}
120
- self.validation_rules: Dict[str, Any] = {}
121
- self.input_params: Dict[str, Dict[str, Any]] = {}
122
-
123
- # Configuration context for value injection
124
- self.config_context = config_context or {}
125
-
126
- # Store ETL config parameters for later loading
127
- self._extract_config_param = extract_config
128
- self._transform_config_param = transform_config
129
- self._load_config_param = load_config
130
- self._extract_file_param = extract_file
131
- self._transform_file_param = transform_file
132
- self._load_file_param = load_file
133
-
134
- # Enhanced features
135
- self.checkpoint_manager = CheckpointManager(checkpoint_dir)
136
- self.progress_tracker = ProgressTracker(progress_callback, verbose)
137
- self.max_memory_mb = max_memory_mb
138
- self.process = None
139
- if PSUTIL_AVAILABLE:
140
- self.process = psutil.Process()
141
-
142
- # Logging context
143
- self.run_id: Optional[str] = None # Correlation ID for this run
144
- self._current_stage: Optional[str] = None # Current pipeline stage
145
-
146
- # Load contract artifacts
147
- if contract_metadata:
148
- self._load_from_metadata(contract_metadata)
149
- elif contract_dict:
150
- self._load_from_dict(contract_dict)
151
- elif contract_file:
152
- file_path = Path(contract_file)
153
- self.contract_dir = file_path.parent
154
- self._load_from_file(file_path)
155
- elif contract_dir:
156
- self.contract_dir = Path(contract_dir)
157
- self._load_from_directory(self.contract_dir)
158
- else:
159
- # If no contract source provided, we still need contract_dir for ETL configs
160
- # unless all ETL configs are provided directly
161
- if not (extract_config or extract_file) and not contract_dir:
162
- raise ValueError(
163
- "Must provide one of: contract_dir, contract_file, contract_dict, "
164
- "contract_metadata, or extract_config/extract_file"
165
- )
166
- # Set contract_dir to None if not provided (ETL configs will be loaded from params)
167
- self.contract_dir = None
168
-
169
- # Load ETL configurations (extract, transform, load)
170
- # Priority: direct dict > file path > contract_dir
171
- self._load_etl_configs()
172
-
173
- # ============================================================================
174
- # INITIALIZATION AND CONFIGURATION LOADING
175
- # ============================================================================
176
-
177
- def _load_from_metadata(self, metadata: ContractMetadata) -> None:
178
- """Load contract from ContractMetadata object."""
179
- self.schema = metadata.schema
180
- self.coercion_rules = metadata.coercion_rules or {}
181
- self.validation_rules = metadata.validation_rules or {}
182
-
183
- def _load_from_dict(self, contract: Dict[str, Any]) -> None:
184
- """Load contract from dictionary."""
185
- self.schema = contract.get("schema")
186
- if not self.schema:
187
- raise ValueError("Contract dictionary must contain 'schema'")
188
-
189
- self.coercion_rules = self._extract_rules(contract.get("coercion_rules", {}))
190
- self.validation_rules = self._extract_rules(contract.get("validation_rules", {}))
191
-
192
- @staticmethod
193
- def _extract_rules(rules_data: Any) -> Dict[str, Any]:
194
- """Extract rules from various formats."""
195
- if not isinstance(rules_data, dict):
196
- return {}
197
-
198
- if "rules" in rules_data:
199
- return rules_data["rules"]
200
- elif not any(k in rules_data for k in ["version", "description", "title"]):
201
- return rules_data
202
- else:
203
- return {}
204
-
205
- def _load_from_file(self, file_path: Path) -> None:
206
- """Load contract from file."""
207
- contract_metadata = parse_contract_file(str(file_path))
208
- self._load_from_metadata(contract_metadata)
209
-
210
- def _load_from_directory(self, contract_dir: Path) -> None:
211
- """Load contract components from directory."""
212
- if not contract_dir.exists():
213
- raise ValueError(f"Contract directory not found: {contract_dir}")
214
-
215
- # Load schema (required) - support both YAML and JSON
216
- schema_path_yaml = contract_dir / "schema.yaml"
217
- schema_path_json = contract_dir / "schema.json"
218
-
219
- schema_path = None
220
- if schema_path_yaml.exists():
221
- schema_path = schema_path_yaml
222
- elif schema_path_json.exists():
223
- schema_path = schema_path_json
224
- else:
225
- # Try to find JSON schema files with dataset name pattern
226
- dataset_name = contract_dir.name
227
- possible_json_schemas = [
228
- contract_dir / f"{dataset_name}_schema.json",
229
- contract_dir / f"{dataset_name}.schema.json",
230
- contract_dir / "schema.json",
231
- ]
232
- for possible_path in possible_json_schemas:
233
- if possible_path.exists():
234
- schema_path = possible_path
235
- break
236
-
237
- if schema_path and schema_path.exists():
238
- if schema_path.suffix == '.json':
239
- import json
240
- with open(schema_path, 'r', encoding='utf-8') as f:
241
- self.schema = json.load(f)
242
- else:
243
- self.schema = self._load_yaml(schema_path)
244
- else:
245
- raise ValueError(
246
- f"Schema file not found in {contract_dir}. "
247
- f"Expected: schema.yaml, schema.json, or {contract_dir.name}_schema.json"
248
- )
249
-
250
- # Load coercion rules (optional)
251
- coercion_path = contract_dir / "coercion_rules.yaml"
252
- if coercion_path.exists():
253
- coercion_data = self._load_yaml(coercion_path)
254
- self.coercion_rules = self._extract_rules(coercion_data)
255
-
256
- # Load validation rules (optional)
257
- validation_path = contract_dir / "validation_rules.yaml"
258
- if validation_path.exists():
259
- validation_data = self._load_yaml(validation_path)
260
- self.validation_rules = self._extract_rules(validation_data)
261
-
262
- def _load_etl_configs(self) -> None:
263
- """
264
- Load ETL configuration files (extract, transform, load).
265
-
266
- Priority order:
267
- 1. Direct dictionary parameters (extract_config, transform_config, load_config)
268
- 2. File path parameters (extract_file, transform_file, load_file)
269
- 3. Files in contract_dir (extract.yaml, transform.yaml, load.yaml)
270
- """
271
- # Load extract config (required)
272
- self.extract_config = self._load_single_config(
273
- config_param=self._extract_config_param,
274
- file_param=self._extract_file_param,
275
- default_filename="extract.yaml",
276
- required=True,
277
- config_name="Extract"
278
- )
279
-
280
- # Load transform config (optional)
281
- self.transform_config = self._load_single_config(
282
- config_param=self._transform_config_param,
283
- file_param=self._transform_file_param,
284
- default_filename="transform.yaml",
285
- required=False,
286
- config_name="Transform"
287
- )
288
-
289
- # Load load config (required)
290
- self.load_config = self._load_single_config(
291
- config_param=self._load_config_param,
292
- file_param=self._load_file_param,
293
- default_filename="load.yaml",
294
- required=True,
295
- config_name="Load"
296
- )
297
-
298
- # Parse input parameters from extract config
299
- self._parse_input_params()
300
-
301
- if not self.schema:
302
- raise ValueError("Schema not loaded")
303
-
304
- # Initialize Dead Letter Queue (will be configured with session in run() method)
305
- self.dlq: Optional[DeadLetterQueue] = None
306
-
307
- def _load_single_config(
308
- self,
309
- config_param: Optional[Dict[str, Any]],
310
- file_param: Optional[str],
311
- default_filename: str,
312
- required: bool,
313
- config_name: str,
314
- ) -> Dict[str, Any]:
315
- """
316
- Load a single ETL config following priority order.
317
-
318
- Args:
319
- config_param: Direct dictionary config (highest priority)
320
- file_param: File path to config (medium priority)
321
- default_filename: Default filename in contract_dir (lowest priority)
322
- required: Whether this config is required
323
- config_name: Name for error messages
324
-
325
- Returns:
326
- Loaded config dictionary (empty dict if not required and not found)
327
- """
328
- # Priority 1: Direct dictionary
329
- if config_param is not None:
330
- return config_param
331
-
332
- # Priority 2: File path
333
- if file_param:
334
- config_path = Path(file_param)
335
- if not config_path.exists():
336
- raise ValueError(f"{config_name} config file not found: {config_path}")
337
- config = self._load_yaml(config_path)
338
- # Set contract_dir from file if not already set
339
- if not self.contract_dir:
340
- self.contract_dir = config_path.parent
341
- return config
342
-
343
- # Priority 3: From contract_dir
344
- if self.contract_dir and self.contract_dir.exists():
345
- config_path = self.contract_dir / default_filename
346
- if config_path.exists():
347
- return self._load_yaml(config_path)
348
-
349
- # Handle missing config
350
- if required:
351
- raise ValueError(
352
- f"{config_name} configuration not found. Provide one of: "
353
- f"{config_name.lower()}_config (dict), {config_name.lower()}_file (path), "
354
- f"or contract_dir with {default_filename}"
355
- )
356
-
357
- return {}
358
-
359
- def _parse_input_params(self) -> None:
360
- """Parse input parameters from extract config."""
361
- input_params_config = self.extract_config.get('input_params', [])
362
- if isinstance(input_params_config, list):
363
- self.input_params = {name: {} for name in input_params_config}
364
- elif isinstance(input_params_config, dict):
365
- self.input_params = input_params_config
366
- else:
367
- self.input_params = {}
368
-
369
- def _load_yaml(self, file_path: Path) -> Dict[str, Any]:
370
- """Load YAML file, return empty dict if not found."""
371
- if not file_path.exists():
372
- return {}
373
- with open(file_path, 'r', encoding='utf-8') as f:
374
- return yaml.safe_load(f) or {}
375
-
376
- def _prepare_params(self, **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
377
- """Prepare params and headers from config and kwargs."""
378
- params = self.extract_config.get('params', {}).copy()
379
- headers = self.extract_config.get('headers', {})
380
-
381
- # Get parameter mapping from extract config (maps input param names to API param names)
382
- param_mapping = self.extract_config.get('param_mapping', {})
383
-
384
- # Merge input arguments
385
- for param_name, param_value in kwargs.items():
386
- if param_name in self.input_params:
387
- # Check if there's a mapping for this parameter
388
- api_param_name = param_mapping.get(param_name, param_name)
389
- params[api_param_name] = param_value
390
- else:
391
- warnings.warn(
392
- f"Unknown input parameter '{param_name}'. "
393
- f"Available: {list(self.input_params.keys())}",
394
- UserWarning
395
- )
396
-
397
- # Validate required input parameters and apply defaults for optional ones
398
- for param_name, param_meta in self.input_params.items():
399
- if param_meta.get('required', False):
400
- # Check if input parameter was provided in kwargs
401
- if param_name not in kwargs:
402
- raise ValueError(
403
- f"Required input parameter '{param_name}' not provided. "
404
- f"Please provide: {param_name}=value"
405
- )
406
- else:
407
- # Apply default value for optional parameters if not provided
408
- if param_name not in kwargs and 'default' in param_meta:
409
- default_value = param_meta.get('default')
410
- # Only add if default is not None (None means truly optional)
411
- if default_value is not None:
412
- api_param_name = param_mapping.get(param_name, param_name)
413
- params[api_param_name] = default_value
414
-
415
- # Resolve values with config context
416
- source_file = str(self.contract_dir / "extract.yaml") if self.contract_dir else None
417
- params = resolve_values(params, context=self.config_context, source_file=source_file)
418
- headers = resolve_values(headers, context=self.config_context, source_file=source_file)
419
-
420
- return params, headers
421
-
422
- # ============================================================================
423
- # EXTRACTION
424
- # ============================================================================
425
-
426
- async def extract(
427
- self,
428
- batch_size: Optional[int] = None,
429
- max_records: Optional[int] = None,
430
- **kwargs
431
- ) -> AsyncIterator[List[Dict[str, Any]]]:
432
- """
433
- Extract data in batches using async generator.
434
-
435
- Yields batches of records for memory-efficient processing.
436
-
437
- Args:
438
- batch_size: Number of records per batch (defaults to extract.yaml config)
439
- max_records: Maximum total records to extract (None = all)
440
- **kwargs: Input parameters defined in extract.yaml's input_params section
441
-
442
- Yields:
443
- Batches of extracted records (lists of dictionaries)
444
-
445
- Example:
446
- >>> async for batch in orchestrator.extract(symbol="AAPL"):
447
- ... print(f"Extracted {len(batch)} records")
448
- """
449
- self._current_stage = 'extract'
450
- if batch_size is None:
451
- batch_size = self.extract_config.get('batch_size', DEFAULT_BATCH_SIZE)
452
-
453
- params, headers = self._prepare_params(**kwargs)
454
-
455
- async for batch in extract_with_pagination_streaming(
456
- self.extract_config, params, headers, self.contract_dir, batch_size, max_records, config_context=self.config_context
457
- ):
458
- yield batch
459
-
460
- # ============================================================================
461
- # TRANSFORMATION (Simple Operations → JSONata → Custom Functions)
462
- # ============================================================================
463
-
464
- def transform(self, raw_data: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]:
465
- """
466
- Transform data using simple operations, JSONata expressions, and/or custom Python functions.
467
-
468
- Pipeline order (applied sequentially):
469
- 1. Simple operations (rename, select, drop, convert, defaults, add)
470
- 2. JSONata transformation (if configured)
471
- 3. Custom function execution (if configured)
472
-
473
- Args:
474
- raw_data: Raw data from extraction
475
- **kwargs: Additional parameters (passed to custom functions)
476
-
477
- Returns:
478
- Transformed data
479
-
480
- Example - Simple operations:
481
- transform_config:
482
- rename:
483
- oldName: new_name
484
- camelCase: snake_case
485
- select:
486
- - field1
487
- - field2
488
- convert:
489
- price: float
490
- quantity: integer
491
- defaults:
492
- status: "pending"
493
-
494
- Example - JSONata (advanced):
495
- transform_config:
496
- jsonata:
497
- expression: |
498
- $.{
499
- "ticker": symbol,
500
- "avg_price": $average(prices)
501
- }
502
-
503
- Example - Custom function:
504
- transform_config:
505
- custom_function:
506
- module: "myproject.transforms"
507
- function: "optimize_data"
508
- mode: "batch"
509
- """
510
- if not self.transform_config:
511
- return raw_data
512
-
513
- data = raw_data
514
-
515
- # Step 1: Apply simple operations (in order)
516
- # Support both new 'transform' key and legacy top-level keys for backward compatibility
517
- simple_ops = {}
518
-
519
- # New format: transform: { rename: {...}, select: [...] }
520
- if 'transform' in self.transform_config:
521
- simple_ops = self.transform_config.get('transform', {})
522
-
523
- # Legacy format: rename: {...} at top level (for backward compatibility)
524
- if 'rename' in self.transform_config and 'transform' not in self.transform_config:
525
- simple_ops['rename'] = self.transform_config.get('rename')
526
- if 'select' in self.transform_config and 'transform' not in self.transform_config:
527
- simple_ops['select'] = self.transform_config.get('select')
528
- if 'drop' in self.transform_config and 'transform' not in self.transform_config:
529
- simple_ops['drop'] = self.transform_config.get('drop')
530
- if 'convert' in self.transform_config and 'transform' not in self.transform_config:
531
- simple_ops['convert'] = self.transform_config.get('convert')
532
- if 'defaults' in self.transform_config and 'transform' not in self.transform_config:
533
- simple_ops['defaults'] = self.transform_config.get('defaults')
534
- if 'add' in self.transform_config and 'transform' not in self.transform_config:
535
- simple_ops['add'] = self.transform_config.get('add')
536
-
537
- if simple_ops:
538
- data = self._apply_simple_operations(data, simple_ops)
539
-
540
- # Step 2: Apply JSONata transformation (if configured)
541
- jsonata_config = self.transform_config.get('jsonata')
542
- if jsonata_config:
543
- data = self._apply_jsonata(data, jsonata_config)
544
-
545
- # Step 3: Apply custom function (if configured)
546
- custom_func_config = self.transform_config.get('custom_function')
547
- if custom_func_config:
548
- data = self._apply_custom_function(data, custom_func_config, **kwargs)
549
-
550
- return data
551
-
552
- def _apply_simple_operations(
553
- self,
554
- data: List[Dict[str, Any]],
555
- config: Dict[str, Any]
556
- ) -> List[Dict[str, Any]]:
557
- """
558
- Apply simple declarative transformation operations.
559
-
560
- Operations are applied in this order:
561
- 1. rename - Rename fields (old_name: new_name)
562
- 2. convert - Convert field types (field: type)
563
- 3. defaults - Set default values for missing fields
564
- 4. add - Add computed fields with expressions
565
- 5. select - Keep only specified fields
566
- 6. drop - Remove specified fields
567
-
568
- Args:
569
- data: Input data (list of records)
570
- config: Simple operations configuration
571
-
572
- Returns:
573
- Transformed data
574
-
575
- Example config:
576
- transform:
577
- rename:
578
- oldName: new_name
579
- camelCase: snake_case
580
- convert:
581
- price: float
582
- quantity: integer
583
- active: boolean
584
- defaults:
585
- status: "pending"
586
- priority: 0
587
- add:
588
- full_name: "${first_name} ${last_name}"
589
- created_at: "now()"
590
- record_id: "uuid()"
591
- select:
592
- - field1
593
- - field2
594
- drop:
595
- - internal_id
596
- - debug_info
597
- """
598
- if not data:
599
- return data
600
-
601
- result = []
602
-
603
- # Get available fields from first record for validation
604
- available_fields = set(data[0].keys()) if data else set()
605
-
606
- # Step 1: Rename fields
607
- rename_map = config.get('rename', {})
608
- if rename_map:
609
- # Validate rename mappings
610
- missing_fields = [old for old in rename_map.keys() if old not in available_fields]
611
- if missing_fields:
612
- logger.warning(
613
- f"Rename operation: Fields not found in data: {missing_fields}. "
614
- f"Available fields: {sorted(available_fields)}"
615
- )
616
-
617
- # Step 2: Convert types
618
- convert_map = config.get('convert', {})
619
-
620
- # Step 3: Defaults
621
- defaults_map = config.get('defaults', {})
622
-
623
- # Step 4: Add computed fields
624
- add_map = config.get('add', {})
625
-
626
- # Step 5: Select fields (keep only these)
627
- select_fields = config.get('select')
628
-
629
- # Step 6: Drop fields (remove these)
630
- drop_fields = set(config.get('drop', []))
631
-
632
- for record in data:
633
- transformed = dict(record)
634
-
635
- # 1. Rename
636
- if rename_map:
637
- for old_name, new_name in rename_map.items():
638
- if old_name in transformed:
639
- transformed[new_name] = transformed.pop(old_name)
640
-
641
- # 2. Convert types
642
- if convert_map:
643
- for field_name, target_type in convert_map.items():
644
- if field_name in transformed:
645
- try:
646
- transformed[field_name] = self._convert_type(
647
- transformed[field_name], target_type
648
- )
649
- except (ValueError, TypeError) as e:
650
- logger.warning(
651
- f"Failed to convert field '{field_name}' to {target_type}: {e}. "
652
- f"Keeping original value."
653
- )
654
-
655
- # 3. Apply defaults
656
- if defaults_map:
657
- for field_name, default_value in defaults_map.items():
658
- if field_name not in transformed or transformed[field_name] is None:
659
- transformed[field_name] = default_value
660
-
661
- # 4. Add computed fields
662
- if add_map:
663
- for field_name, expression in add_map.items():
664
- try:
665
- transformed[field_name] = self._evaluate_expression(
666
- expression, transformed
667
- )
668
- except Exception as e:
669
- logger.warning(
670
- f"Failed to compute field '{field_name}': {e}. "
671
- f"Skipping this field."
672
- )
673
-
674
- # 5. Select (keep only specified fields)
675
- if select_fields:
676
- transformed = {
677
- k: v for k, v in transformed.items()
678
- if k in select_fields
679
- }
680
-
681
- # 6. Drop (remove specified fields)
682
- if drop_fields:
683
- transformed = {
684
- k: v for k, v in transformed.items()
685
- if k not in drop_fields
686
- }
687
-
688
- result.append(transformed)
689
-
690
- return result
691
-
692
- def _convert_type(self, value: Any, target_type: str) -> Any:
693
- """
694
- Convert a value to the specified type.
695
-
696
- Args:
697
- value: Value to convert
698
- target_type: Target type (string, integer, float, boolean, datetime, date)
699
-
700
- Returns:
701
- Converted value
702
- """
703
- if value is None:
704
- return None
705
-
706
- target_type_lower = target_type.lower().strip()
707
-
708
- if target_type_lower in ('str', 'string'):
709
- return str(value)
710
- elif target_type_lower in ('int', 'integer'):
711
- if isinstance(value, str):
712
- # Try to parse as float first (handles "1.0" -> 1)
713
- try:
714
- return int(float(value))
715
- except ValueError:
716
- return int(value)
717
- return int(value)
718
- elif target_type_lower in ('float', 'number', 'numeric'):
719
- if isinstance(value, str):
720
- return float(value)
721
- return float(value)
722
- elif target_type_lower in ('bool', 'boolean'):
723
- if isinstance(value, str):
724
- return value.lower() in ('true', '1', 'yes', 'on')
725
- return bool(value)
726
- elif target_type_lower == 'datetime':
727
- from datetime import datetime
728
- if isinstance(value, str):
729
- # Try common datetime formats
730
- for fmt in [
731
- '%Y-%m-%dT%H:%M:%S',
732
- '%Y-%m-%dT%H:%M:%S.%f',
733
- '%Y-%m-%dT%H:%M:%SZ',
734
- '%Y-%m-%dT%H:%M:%S.%fZ',
735
- '%Y-%m-%d %H:%M:%S',
736
- '%Y-%m-%d %H:%M:%S.%f',
737
- ]:
738
- try:
739
- return datetime.strptime(value, fmt)
740
- except ValueError:
741
- continue
742
- raise ValueError(f"Cannot parse datetime: {value}")
743
- return value
744
- elif target_type_lower == 'date':
745
- from datetime import date, datetime
746
- if isinstance(value, str):
747
- # Try common date formats
748
- for fmt in ['%Y-%m-%d', '%Y/%m/%d', '%m/%d/%Y']:
749
- try:
750
- dt = datetime.strptime(value, fmt)
751
- return dt.date()
752
- except ValueError:
753
- continue
754
- raise ValueError(f"Cannot parse date: {value}")
755
- elif isinstance(value, datetime):
756
- return value.date()
757
- return value
758
- else:
759
- raise ValueError(f"Unsupported target type: {target_type}")
760
-
761
- def _evaluate_expression(self, expression: str, record: Dict[str, Any]) -> Any:
762
- """
763
- Evaluate a simple expression in the context of a record.
764
-
765
- Supports:
766
- - Field references: "${field_name}"
767
- - String concatenation: "${field1} ${field2}"
768
- - Simple functions: "now()", "uuid()"
769
- - Literal values (if no placeholders)
770
-
771
- Args:
772
- expression: Expression string
773
- record: Record dictionary for context
774
-
775
- Returns:
776
- Evaluated result
777
-
778
- Examples:
779
- "${first_name} ${last_name}" -> "John Doe"
780
- "now()" -> "2024-01-01T12:00:00"
781
- "uuid()" -> "123e4567-e89b-12d3-a456-426614174000"
782
- "static_value" -> "static_value"
783
- """
784
- if not isinstance(expression, str):
785
- return expression
786
-
787
- expression = expression.strip()
788
-
789
- # Handle special functions
790
- if expression == 'now()':
791
- return datetime.now().isoformat()
792
- elif expression == 'uuid()':
793
- return str(uuid.uuid4())
794
-
795
- # Handle field references and string interpolation
796
- try:
797
- # Simple string interpolation: "${field1} ${field2}"
798
- result = expression
799
- placeholders_found = False
800
-
801
- # Find all ${...} placeholders
802
- placeholder_pattern = r'\$\{([^}]+)\}'
803
- matches = re.findall(placeholder_pattern, expression)
804
-
805
- if matches:
806
- placeholders_found = True
807
- for field_name in matches:
808
- if field_name in record:
809
- value = record[field_name]
810
- placeholder = f"${{{field_name}}}"
811
- result = result.replace(placeholder, str(value) if value is not None else '')
812
- else:
813
- logger.warning(
814
- f"Expression '{expression}': Field '{field_name}' not found in record. "
815
- f"Available fields: {sorted(record.keys())}"
816
- )
817
- # Replace with empty string if field not found
818
- placeholder = f"${{{field_name}}}"
819
- result = result.replace(placeholder, '')
820
-
821
- # If no placeholders were found and it's not a function, return as literal
822
- if not placeholders_found and not expression.endswith('()'):
823
- return expression
824
-
825
- return result
826
- except Exception as e:
827
- raise ValueError(f"Failed to evaluate expression '{expression}': {e}") from e
828
-
829
- def _apply_jsonata(
830
- self,
831
- data: List[Dict[str, Any]],
832
- config: Dict[str, Any]
833
- ) -> List[Dict[str, Any]]:
834
- """
835
- Apply JSONata expression to transform data.
836
-
837
- Args:
838
- data: Input data (list of records)
839
- config: JSONata configuration with 'expression' and optional 'mode'
840
-
841
- Returns:
842
- Transformed data
843
-
844
- Example config:
845
- jsonata:
846
- expression: |
847
- $.{
848
- "ticker": symbol,
849
- "avg_price": $average(prices),
850
- "total_volume": $sum(volumes)
851
- }
852
- mode: "batch" # or "record"
853
- """
854
- expression_str = config.get('expression')
855
- if not expression_str:
856
- return data
857
-
858
- mode = config.get('mode', 'batch')
859
-
860
- try:
861
- expr = jsonata.Jsonata(expression_str)
862
-
863
- if mode == 'batch':
864
- # Apply expression to entire dataset
865
- result = expr.evaluate(data)
866
- if result is None:
867
- return []
868
- return result if isinstance(result, list) else [result]
869
- else:
870
- # Apply expression to each record individually
871
- return [expr.evaluate(record) for record in data if expr.evaluate(record) is not None]
872
-
873
- except Exception as e:
874
- logger.error(f"JSONata transformation failed: {e}")
875
- raise ValueError(f"JSONata transformation error: {e}") from e
876
-
-     def _apply_custom_function(
-         self,
-         data: List[Dict[str, Any]],
-         config: Dict[str, Any],
-         **kwargs
-     ) -> List[Dict[str, Any]]:
-         """
-         Execute a custom Python function for transformation.
-
-         Args:
-             data: Input data
-             config: Custom function configuration
-             **kwargs: Additional parameters passed to the function
-
-         Returns:
-             Transformed data
-
-         Example config:
-             custom_function:
-               module: "pyoptima"
-               function: "optimize_from_etl_inputs"
-               mode: "batch"
-               kwargs:
-                 method: "min_volatility"
-                 solver: "ipopt"
-
-         Alternative config (using callable path):
-             custom_function:
-               callable: "myproject.transforms.optimize_portfolio"
-               mode: "batch"
-         """
-         # Get module and function
-         callable_path = config.get('callable')
-         module_path = config.get('module')
-         func_name = config.get('function')
-
-         if callable_path:
-             # Parse "module.submodule.function" format
-             parts = callable_path.rsplit('.', 1)
-             if len(parts) != 2:
-                 raise ValueError(f"Invalid callable path: {callable_path}. Use 'module.function' format.")
-             module_path, func_name = parts
-
-         if not module_path or not func_name:
-             raise ValueError("custom_function requires either 'callable' or 'module' + 'function'")
-
-         # Dynamic import
-         try:
-             module = importlib.import_module(module_path)
-             func = getattr(module, func_name)
-         except ImportError as e:
-             raise ValueError(f"Cannot import module '{module_path}': {e}") from e
-         except AttributeError as e:
-             raise ValueError(f"Function '{func_name}' not found in module '{module_path}'") from e
-
-         # Handle class-based methods (e.g., pyoptima optimization methods)
-         if isinstance(func, type):
-             instance = func()
-             if hasattr(instance, 'optimize'):
-                 func = instance.optimize
-             elif hasattr(instance, 'run'):
-                 func = instance.run
-             elif hasattr(instance, '__call__'):
-                 func = instance
-             else:
-                 raise ValueError(f"Class '{func_name}' has no 'optimize', 'run', or '__call__' method")
-
-         # Get mode and kwargs
-         mode = config.get('mode', 'batch')
-         func_kwargs = config.get('kwargs', {})
-
-         # Merge with runtime kwargs
-         merged_kwargs = {**func_kwargs, **kwargs}
-
-         try:
-             if mode == 'batch':
-                 result = func(data, **merged_kwargs)
-                 if result is None:
-                     return []
-                 return result if isinstance(result, list) else [result]
-             else:
-                 # Record mode
-                 results = []
-                 for record in data:
-                     record_result = func(record, **merged_kwargs)
-                     if record_result is not None:
-                         if isinstance(record_result, list):
-                             results.extend(record_result)
-                         else:
-                             results.append(record_result)
-                 return results
-
-         except Exception as e:
-             logger.error(f"Custom function '{func_name}' failed: {e}")
-             raise ValueError(f"Custom function error: {e}") from e
-
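The dynamic lookup boils down to `importlib.import_module` plus `getattr`. A minimal sketch of how a `callable` path from the config resolves; `json.dumps` stands in for a user transform such as the `myproject.transforms.optimize_portfolio` shown in the docstring:

    import importlib

    callable_path = "json.dumps"                      # stand-in for a real transform path
    module_path, func_name = callable_path.rsplit(".", 1)
    func = getattr(importlib.import_module(module_path), func_name)
    print(func({"a": 1}))                             # '{"a": 1}'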
-     # ============================================================================
-     # LOADING
-     # ============================================================================
-
-     async def load(
-         self,
-         transformed_data: List[Dict[str, Any]],
-         session: Any = None,
-         **kwargs,
-     ) -> Dict[str, Any]:
-         """Load transformed data into the database."""
-         target_table = self.load_config.get('target_table')
-         schema_name = self.load_config.get('schema_name')
-         if not schema_name:
-             raise ValueError(
-                 "schema_name must be specified in load.yaml. "
-                 "Example: schema_name: public"
-             )
-         write_method = self.load_config.get('write_method', 'upsert')
-         primary_key = self.load_config.get('primary_key')
-         unique_constraints = self.load_config.get('unique_constraints', [])
-         # Keep primary_key as-is (can be string or list for composite keys)
-         # The load functions now handle both single and composite primary keys
-         batch_size = self.load_config.get('batch_size', 1000)
-
-         # If primary_key is 'id' and not in the data, use unique constraints for conflict detection
-         # This allows using UUID primary keys while upserting on natural keys
-         conflict_key = primary_key
-         if write_method == 'upsert' and transformed_data:
-             incoming_columns = set(transformed_data[0].keys())
-             # Check if primary_key is 'id' (string) or contains 'id' (list)
-             pk_is_id = (isinstance(primary_key, str) and primary_key == 'id') or \
-                 (isinstance(primary_key, list) and len(primary_key) == 1 and primary_key[0] == 'id')
-
-             if pk_is_id and 'id' not in incoming_columns:
-                 # Use first unique constraint for conflict detection
-                 if unique_constraints:
-                     # unique_constraints can be a list of lists or a list of strings
-                     if isinstance(unique_constraints[0], list):
-                         conflict_key = unique_constraints[0]  # First constraint (can be composite)
-                     else:
-                         conflict_key = unique_constraints[0] if isinstance(unique_constraints[0], str) else unique_constraints
-                 else:
-                     # Fallback: if no unique constraints, can't do upsert
-                     raise ValueError(
-                         f"Cannot perform upsert: primary_key is 'id' (auto-generated) but no unique_constraints "
-                         f"specified in load.yaml for conflict detection. Please specify unique_constraints."
-                     )
-
-         if not target_table:
-             raise ValueError("target_table not specified in load configuration")
-
-         tunnel = None
-         if session is None:
-             try:
-                 engine, db_session, db_type, tunnel = get_database_connection(
-                     self.load_config, self.contract_dir, config_context=self.config_context
-                 )
-                 try:
-                     result = load_data(
-                         transformed_data,
-                         db_session,
-                         schema_name,
-                         target_table,
-                         write_method,
-                         conflict_key,  # Use conflict_key (may be unique constraint instead of PK)
-                         batch_size,
-                         db_type,
-                     )
-                     return result
-                 finally:
-                     db_session.close()
-                     if tunnel:
-                         tunnel.stop()
-             except Exception as e:
-                 if tunnel:
-                     try:
-                         tunnel.stop()
-                     except Exception:
-                         pass
-                 raise
-         else:
-             from pycharter.etl_generator.database import detect_database_type
-             from sqlalchemy.ext.asyncio import AsyncSession
-
-             # Detect database type
-             db_type = "postgresql"
-             if hasattr(session, 'bind') and hasattr(session.bind, 'url'):
-                 db_url = str(session.bind.url)
-                 db_type = detect_database_type(db_url)
-
-             # load_data is now async and expects AsyncSession
-             if not isinstance(session, AsyncSession):
-                 raise ValueError(
-                     f"load_data requires an AsyncSession, but got {type(session)}. "
-                     "Please use an AsyncSession for database operations."
-                 )
-
-             return await load_data(
-                 transformed_data,
-                 session,
-                 schema_name,
-                 target_table,
-                 write_method,
-                 conflict_key,  # Use conflict_key (may be unique constraint instead of PK)
-                 batch_size,
-                 db_type,
-             )
-
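The subtle part of the method above is the conflict-key fallback: an auto-generated `id` primary key cannot drive an upsert when the incoming rows carry no `id` column, so the first unique constraint is used for conflict detection instead. A condensed sketch of that decision with illustrative config values:

    load_config = {
        "write_method": "upsert",
        "primary_key": "id",                               # auto-generated column
        "unique_constraints": [["symbol", "as_of_date"]],  # natural key
    }
    row = {"symbol": "AAPL", "as_of_date": "2024-01-01", "close": 185.0}

    conflict_key = load_config["primary_key"]
    if load_config["write_method"] == "upsert" and "id" not in row:
        # Fall back to the first unique constraint (possibly composite).
        conflict_key = load_config["unique_constraints"][0]
    print(conflict_key)                                    # ['symbol', 'as_of_date']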
-     # ============================================================================
-     # MEMORY MANAGEMENT
-     # ============================================================================
-
-     def _check_memory(self) -> Optional[float]:
-         """Get current memory usage in MB, or None if psutil not available."""
-         if not PSUTIL_AVAILABLE or not self.process:
-             return None
-         return self.process.memory_info().rss / 1024 / 1024
-
-     def _enforce_memory_limit(self):
-         """Check and enforce memory limits."""
-         if self.max_memory_mb:
-             current = self._check_memory()
-             if current and current > self.max_memory_mb:
-                 gc.collect()
-                 current = self._check_memory()
-
-                 if current and current > self.max_memory_mb:
-                     raise MemoryError(
- f"Memory limit exceeded: {current:.1f}MB > {self.max_memory_mb}MB. "
1103
- f"Consider increasing batch_size."
1104
-                     )
-
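`_check_memory` is a thin wrapper over psutil's resident-set size, and the enforcement step retries once after a garbage collection before giving up. The same pattern in isolation (the 512 MB limit is an arbitrary example):

    import gc
    import psutil

    process = psutil.Process()
    limit_mb = 512                                        # arbitrary example limit

    rss_mb = process.memory_info().rss / 1024 / 1024
    if rss_mb > limit_mb:
        gc.collect()                                      # try to reclaim before failing
        rss_mb = process.memory_info().rss / 1024 / 1024
        if rss_mb > limit_mb:
            raise MemoryError(f"{rss_mb:.1f}MB > {limit_mb}MB")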
-     # ============================================================================
-     # PIPELINE EXECUTION
-     # ============================================================================
-
-     def _log_error(self, message: str, error: Exception, **context) -> None:
-         """Log error with context and traceback."""
-         extra = {
-             'run_id': self.run_id,
-             'pipeline': self.contract_dir.name if self.contract_dir else 'unknown',
-             'stage': self._current_stage,
-             'error_type': type(error).__name__,
-             **context
-         }
-         logger.error(message, extra=extra, exc_info=True)
-
-     def _log_warning(self, message: str, **context) -> None:
-         """Log warning with context."""
-         extra = {
-             'run_id': self.run_id,
-             'pipeline': self.contract_dir.name if self.contract_dir else 'unknown',
-             'stage': self._current_stage,
-             **context
-         }
-         logger.warning(message, extra=extra)
-
-     def _log_info(self, message: str, **context) -> None:
-         """Log info with context."""
-         extra = {
-             'run_id': self.run_id,
-             'pipeline': self.contract_dir.name if self.contract_dir else 'unknown',
-             'stage': self._current_stage,
-             **context
-         }
-         logger.info(message, extra=extra)
-
-     def _summarize_errors(self, failed_batches: List[Dict[str, Any]]) -> str:
-         """
-         Summarize errors from failed batches.
-
-         Groups errors by type and shows the most common errors first.
-
-         Args:
-             failed_batches: List of failed batch dictionaries with 'error', 'error_type', 'batch_num', 'records'
-
-         Returns:
-             Formatted error summary string
-         """
-         if not failed_batches:
-             return "No error details available."
-
-         # Group errors by type
-         error_type_counts = Counter(batch['error_type'] for batch in failed_batches)
-         errors_by_type = defaultdict(list)
-         for batch in failed_batches:
-             errors_by_type[batch['error_type']].append(batch)
-
-         # Build summary
-         lines = []
-
-         # Summary statistics
-         total_failed = len(failed_batches)
-         total_records_failed = sum(batch.get('records', 0) for batch in failed_batches)
-         lines.append(f"Total failed batches: {total_failed}")
-         lines.append(f"Total records in failed batches: {total_records_failed}")
-         lines.append("")
-
-         # Group by error type (most common first)
-         lines.append("Errors by type:")
-         for error_type, count in error_type_counts.most_common():
-             batches_of_type = errors_by_type[error_type]
-             lines.append(f"  {error_type}: {count} occurrence(s)")
-
-             # Show sample error messages (up to 3 unique ones)
-             unique_errors = {}
-             for batch in batches_of_type:
-                 error_msg = batch.get('error', 'Unknown error')
-                 # Truncate very long error messages
-                 if len(error_msg) > 200:
-                     error_msg = error_msg[:200] + "..."
-                 if error_msg not in unique_errors:
-                     unique_errors[error_msg] = batch.get('batch_num', '?')
-                 if len(unique_errors) >= 3:
-                     break
-
-             for error_msg, batch_num in unique_errors.items():
-                 lines.append(f"    - Batch {batch_num}: {error_msg}")
-
-             if len(unique_errors) < len(batches_of_type):
-                 remaining = len(batches_of_type) - len(unique_errors)
-                 lines.append(f"    ... and {remaining} more occurrence(s) of this error type")
-             lines.append("")
-
-         return "\n".join(lines)
-
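The grouping relies on `collections.Counter` to order error types by frequency. A tiny illustration with made-up batch records:

    from collections import Counter

    failed = [
        {"batch_num": 2, "error_type": "IntegrityError", "error": "duplicate key"},
        {"batch_num": 5, "error_type": "IntegrityError", "error": "duplicate key"},
        {"batch_num": 7, "error_type": "DataError", "error": "bad value"},
    ]
    for error_type, count in Counter(b["error_type"] for b in failed).most_common():
        print(f"  {error_type}: {count} occurrence(s)")
    #   IntegrityError: 2 occurrence(s)
    #   DataError: 1 occurrence(s)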
-     async def run(
-         self,
-         dry_run: bool = False,
-         session: Any = None,
-         checkpoint_id: Optional[str] = None,
-         resume: bool = False,
-         batch_size: Optional[int] = None,
-         max_retries: int = 3,
-         error_threshold: float = 0.1,
-         **kwargs,
-     ) -> Dict[str, Any]:
-         """
-         Run the complete ETL pipeline in streaming mode.
-
-         Processes data incrementally: Extract-Batch → Transform-Batch → Load-Batch.
-         This ensures constant memory usage regardless of dataset size.
-
-         Args:
-             dry_run: If True, skip database operations
-             session: Optional database session
-             checkpoint_id: Optional checkpoint ID for resume capability
-             resume: If True, resume from checkpoint
-             batch_size: Batch size for processing (defaults to extract.yaml config)
-             max_retries: Maximum retries for failed batches
-             error_threshold: Error rate threshold (0.0-1.0) before aborting
-             **kwargs: Additional parameters passed to extract()
-
-         Returns:
-             Pipeline execution results dictionary
-         """
-         # Generate correlation ID for this run
-         self.run_id = str(uuid.uuid4())[:8]
-
-         if batch_size is None:
-             batch_size = self.extract_config.get('batch_size', DEFAULT_BATCH_SIZE)
-
-         # Note: Tables must be created manually or via migrations.
-         # PyCharter no longer creates tables from schema.json.
-
-         # Initialize Dead Letter Queue
-         dlq_config = self.load_config.get('dead_letter_queue', {})
-         dlq_enabled = dlq_config.get('enabled', True)
-         dlq_backend = dlq_config.get('backend', 'database')
-         dlq_storage_path = dlq_config.get('storage_path')
-         dlq_schema = dlq_config.get('schema_name')  # Optional schema name
-
-         # Get pipeline name for DLQ
-         pipeline_name = (
-             self.contract_dir.name if self.contract_dir else
-             self.extract_config.get('title', 'unknown_pipeline')
-         )
-
-         self.dlq = DeadLetterQueue(
-             db_session=session if not dry_run else None,
-             storage_backend=dlq_backend,
-             storage_path=dlq_storage_path,
-             enabled=dlq_enabled,
-             schema_name=dlq_schema,  # Pass schema name if provided
-         )
-
-         self._log_info(
-             "Starting ETL pipeline",
-             batch_size=batch_size,
-             dry_run=dry_run,
-             checkpoint_id=checkpoint_id,
-             resume=resume,
-             input_params=kwargs,
-             dlq_enabled=dlq_enabled,
-         )
-
-         results = {
-             'extraction': {'batches_processed': 0, 'total_records': 0},
-             'transformation': {'batches_processed': 0, 'total_records': 0},
-             'loading': {'batches_processed': 0, 'total_records': 0, 'inserted': 0, 'updated': 0},
-             'success': False,
-             'failed_batches': [],
-             'dlq_records': 0,
-         }
-
-         # Load checkpoint if resuming
-         start_batch = 0
-         if resume and checkpoint_id:
-             checkpoint_state = self.checkpoint_manager.load(checkpoint_id)
-             if checkpoint_state:
-                 kwargs.update(checkpoint_state.last_processed_params)
-                 start_batch = checkpoint_state.batch_num
-
-         self.progress_tracker.start()
-         batch_num = 0
-         total_records = 0
-         failed_batches = []
-
-         try:
-             async for batch in self.extract(batch_size=batch_size, **kwargs):
-                 batch_num += 1
-
-                 # Skip batches if resuming
-                 if batch_num <= start_batch:
-                     continue
-
-                 batch_start_time = datetime.now()
-
-                 try:
-                     self._enforce_memory_limit()
-
-                     # Transform batch
-                     self._current_stage = 'transform'
-                     transformed_batch = self.transform(batch, **kwargs)
-
-                     # Load batch
-                     self._current_stage = 'load'
-                     if not dry_run:
-                         load_result = await self.load(transformed_batch, session=session, **kwargs)
-                         results['loading']['inserted'] += load_result.get('inserted', 0)
-                         results['loading']['updated'] += load_result.get('updated', 0)
-                         results['loading']['total_records'] += load_result.get('total', 0)
-
-                     # Update counters
-                     total_records += len(batch)
-                     results['extraction']['total_records'] += len(batch)
-                     results['extraction']['batches_processed'] = batch_num
-                     results['transformation']['total_records'] += len(transformed_batch)
-                     results['transformation']['batches_processed'] = batch_num
-                     results['loading']['batches_processed'] = batch_num
-
-                     # Report progress
-                     memory_usage = self._check_memory()
-                     batch_time = (datetime.now() - batch_start_time).total_seconds()
-                     self.progress_tracker.record_batch_time(batch_time)
-                     self.progress_tracker.report(
-                         'extract',
-                         batch_num,
-                         total_records,
-                         memory_usage_mb=memory_usage,
-                     )
-
-                     # Save checkpoint
-                     if checkpoint_id:
-                         self.checkpoint_manager.save(
-                             checkpoint_id,
-                             'extract',
-                             batch_num,
-                             total_records,
-                             kwargs,
-                         )
-
-                     # Cleanup
-                     del batch, transformed_batch
-                     gc.collect()
-
-                 except Exception as e:
-                     batch_duration = (datetime.now() - batch_start_time).total_seconds()
-                     error_msg = str(e)
-                     error_type = type(e).__name__
-
-                     # Check if this is a connection-related error
-                     is_connection_error = (
-                         'connection' in error_msg.lower() or
-                         'closed' in error_msg.lower() or
-                         'ConnectionDoesNotExistError' in error_type or
-                         'ConnectionError' in error_type or
-                         'InterfaceError' in error_type or
-                         ('DBAPIError' in error_type and ('connection' in error_msg.lower() or 'closed' in error_msg.lower()))
-                     )
-
-                     self._log_error(
-                         "Batch processing failed",
-                         e,
-                         batch_num=batch_num,
-                         batch_size=len(batch) if batch else 0,
-                         total_records=total_records,
-                         is_connection_error=is_connection_error,
-                     )
-
-                     # For connection errors, back off before checking the error rate.
-                     # This prevents aborting the pipeline on transient connection issues.
-                     if is_connection_error and len(failed_batches) < max_retries:
-                         wait_time = min(2 ** len(failed_batches), 5.0)  # Exponential backoff, max 5s
-                         self._log_warning(
-                             f"Connection error in batch {batch_num}, retrying (attempt {len(failed_batches) + 1}/{max_retries})",
-                             batch_num=batch_num,
-                             retry_attempt=len(failed_batches) + 1,
-                             wait_seconds=wait_time,
-                         )
-                         await asyncio.sleep(wait_time)
-                         continue  # Move on without recording the failure; the failed batch itself is not re-extracted
-
-                     # Not a connection error, or retries exhausted - add to failed batches
-                     failed_batches.append({
-                         'batch_num': batch_num,
-                         'error': error_msg,
-                         'error_type': error_type,
-                         'records': len(batch) if batch else 0,
-                     })
-
-                     # Add failed batch to Dead Letter Queue
-                     if batch and self.dlq:
-                         # Determine DLQ reason
-                         if is_connection_error:
-                             dlq_reason = DLQReason.CONNECTION_ERROR
-                         elif self._current_stage == 'extract':
-                             dlq_reason = DLQReason.EXTRACTION_ERROR
-                         elif self._current_stage == 'transform':
-                             dlq_reason = DLQReason.TRANSFORMATION_ERROR
-                         elif self._current_stage == 'load':
-                             dlq_reason = DLQReason.LOAD_ERROR
-                         else:
-                             dlq_reason = DLQReason.UNKNOWN
-
-                         # Add batch to DLQ
-                         dlq_records = await self.dlq.add_batch(
-                             pipeline_name=pipeline_name,
-                             batch=batch,
-                             reason=dlq_reason,
-                             error_message=error_msg,
-                             error_type=error_type,
-                             stage=self._current_stage or 'unknown',
-                             metadata={
-                                 'batch_num': batch_num,
-                                 'total_records': total_records,
-                                 'run_id': self.run_id,
-                                 'is_connection_error': is_connection_error,
-                             },
-                         )
-                         results['dlq_records'] += len(dlq_records)
-
-                     # Check error rate (only after connection retries are exhausted)
-                     # Also be more lenient for small batch counts (don't abort on first failure)
-                     min_batches_for_error_check = 3  # Need at least 3 batches before checking error rate
-                     if batch_num >= min_batches_for_error_check:
-                         error_rate = len(failed_batches) / batch_num if batch_num > 0 else 1.0
-                         if error_rate > error_threshold:
-                             # Summarize errors before raising
-                             error_summary = self._summarize_errors(failed_batches)
-
-                             self._log_error(
-                                 "Error rate threshold exceeded",
-                                 RuntimeError("Error rate threshold exceeded"),
-                                 error_rate=error_rate,
-                                 threshold=error_threshold,
-                                 failed_batches=len(failed_batches),
-                                 total_batches=batch_num,
-                                 error_summary=error_summary,
-                             )
-
-                             error_msg = (
-                                 f"Error rate too high: {error_rate:.1%} > {error_threshold:.1%}. "
-                                 f"Aborting pipeline.\n\n"
-                                 f"Error Summary ({len(failed_batches)} failed batches out of {batch_num} total):\n"
-                                 f"{error_summary}"
-                             )
-                             raise RuntimeError(error_msg)
-
-                     # Backoff for non-connection errors; processing then continues with the next batch
-                     if len(failed_batches) <= max_retries:
-                         wait_time = 2 ** len(failed_batches)
-                         self._log_warning(
-                             f"Retrying batch {batch_num}",
-                             batch_num=batch_num,
-                             retry_attempt=len(failed_batches),
-                             wait_seconds=wait_time,
-                         )
-                         await asyncio.sleep(wait_time)
-                         continue
-                     else:
-                         self.progress_tracker.report(
-                             'extract',
-                             batch_num,
-                             total_records,
-                             error_count=len(failed_batches),
-                         )
-
-             results['failed_batches'] = failed_batches
-             results['success'] = len(failed_batches) < batch_num * error_threshold
-
-             # Add DLQ statistics to results
-             if self.dlq:
-                 try:
-                     dlq_stats = self.dlq.get_statistics(pipeline_name=pipeline_name)
-                     results['dlq_statistics'] = dlq_stats
-                 except Exception as e:
-                     logger.warning(f"Failed to get DLQ statistics: {e}")
-
-             self._log_info(
-                 "ETL pipeline completed",
-                 batches=batch_num,
-                 records=total_records,
-                 failed_batches=len(failed_batches),
-                 inserted=results['loading'].get('inserted', 0),
-                 updated=results['loading'].get('updated', 0),
-                 dlq_records=results.get('dlq_records', 0),
-             )
-
-             # Delete checkpoint on success
-             if checkpoint_id and results['success']:
-                 self.checkpoint_manager.delete(checkpoint_id)
-
-         except Exception as e:
-             self._log_error(
-                 "ETL pipeline failed",
-                 e,
-                 batches_processed=batch_num,
-                 records_processed=total_records,
-                 failed_batches=len(failed_batches),
-             )
-
-             if checkpoint_id:
-                 self.checkpoint_manager.save(
-                     checkpoint_id,
-                     'error',
-                     batch_num,
-                     total_records,
-                     kwargs,
-                     error=str(e),
-                 )
-             results['error'] = str(e)
-             results['success'] = False
-             raise
-
-         return results
-
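Two small numeric policies govern the loop above: capped exponential backoff for connection errors, and an error-rate abort that only applies once at least three batches have run. Both are reproduced here with the defaults from the removed code:

    # Connection-error backoff: 2^n seconds, capped at 5s.
    print([min(2 ** n, 5.0) for n in range(5)])       # [1, 2, 4, 5.0, 5.0]

    # Error-rate abort: skipped until at least 3 batches have been seen.
    failed, total, threshold = 2, 10, 0.1
    if total >= 3 and failed / total > threshold:
        raise RuntimeError(f"Error rate too high: {failed / total:.1%} > {threshold:.1%}")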
-     async def run_multiple(
-         self,
-         param_name: Optional[str] = None,
-         param_values: Optional[List[Any]] = None,
-         param_sets: Optional[List[Dict[str, Any]]] = None,
-         batch_size: int = 5,
-         delay_between_runs: float = 1.0,
-         dry_run: bool = False,
-         session: Any = None,
-         **kwargs,
-     ) -> List[Dict[str, Any]]:
-         """
-         Run ETL pipeline multiple times with different parameter sets.
-
-         This method allows you to efficiently run the same ETL pipeline multiple times
-         with varying parameters. You can either:
-         1. Provide a single parameter name and list of values (simple case)
-         2. Provide a list of parameter dictionaries (complex case with multiple varying params)
-
-         Args:
-             param_name: Name of the parameter to vary (e.g., 'symbol', 'ticker', 'date')
-                 Required if using param_values.
-             param_values: List of values for the specified parameter.
-                 Each value will be passed as {param_name: value} to run().
-             param_sets: List of parameter dictionaries. Each dict will be unpacked
-                 and passed to run() as **params. Use this when multiple
-                 parameters vary between runs.
-             batch_size: Number of runs to process before a brief pause (for rate limiting)
-             delay_between_runs: Delay in seconds between individual runs (for rate limiting)
-             dry_run: If True, skip database operations
-             session: Optional database session
-             **kwargs: Additional parameters passed to each run() call (common to all runs)
-
-         Returns:
-             List of result dictionaries, each containing:
-                 - 'params': The parameters used for this run
-                 - 'success': Whether the run succeeded
-                 - 'records': Number of records processed (if successful)
-                 - 'result': Full result dictionary from run() (if successful)
-                 - 'error': Error message (if failed)
-
-         Examples:
-             # Simple case: vary a single parameter
-             >>> results = await orchestrator.run_multiple(
-             ...     param_name='symbol',
-             ...     param_values=['AAPL', 'MSFT', 'GOOGL'],
-             ...     batch_size=5,
-             ...     delay_between_runs=1.0
-             ... )
-
-             # Complex case: vary multiple parameters
-             >>> results = await orchestrator.run_multiple(
-             ...     param_sets=[
-             ...         {'symbol': 'AAPL', 'date': '2024-01-01'},
-             ...         {'symbol': 'MSFT', 'date': '2024-01-02'},
-             ...     ],
-             ...     batch_size=3,
-             ...     delay_between_runs=0.5
-             ... )
-         """
-         # Validate inputs
-         if param_sets is not None:
-             if param_name is not None or param_values is not None:
-                 raise ValueError(
-                     "Cannot use both param_sets and param_name/param_values. "
-                     "Use either param_sets OR param_name+param_values."
-                 )
-             if not isinstance(param_sets, list) or len(param_sets) == 0:
-                 raise ValueError("param_sets must be a non-empty list of dictionaries")
-             # Convert param_sets to list of dicts
-             runs = [dict(params) for params in param_sets]
-         elif param_name is not None and param_values is not None:
-             if not isinstance(param_values, list) or len(param_values) == 0:
-                 raise ValueError("param_values must be a non-empty list")
-             # Convert param_name + param_values to list of dicts
-             runs = [{param_name: value} for value in param_values]
-         else:
-             raise ValueError(
-                 "Must provide either (param_name + param_values) OR param_sets"
-             )
-
-         results = []
-
-         for i in range(0, len(runs), batch_size):
-             run_batch = runs[i:i + batch_size]
-
-             for run_params in run_batch:
-                 try:
-                     # Merge run_params with common kwargs
-                     merged_params = {**kwargs, **run_params}
-                     result = await self.run(
-                         dry_run=dry_run,
-                         session=session,
-                         **merged_params
-                     )
-                     results.append({
-                         'params': run_params,
-                         'success': result['success'],
-                         'records': result.get('loading', {}).get('total_records', 0),
-                         'result': result,
-                     })
-                 except Exception as e:
-                     results.append({
-                         'params': run_params,
-                         'success': False,
-                         'error': str(e),
-                     })
-
-                 # Rate limiting
-                 if i + batch_size < len(runs) or run_params != run_batch[-1]:
-                     await asyncio.sleep(delay_between_runs)
-
-         return results
-
-
- def create_orchestrator(
-     contract_dir: Optional[str] = None,
-     **kwargs,
- ) -> ETLOrchestrator:
-     """
-     Create an ETL orchestrator instance.
-
-     Args:
-         contract_dir: Directory containing contract files and ETL configs
-         **kwargs: Additional arguments passed to ETLOrchestrator
-
-     Returns:
-         ETLOrchestrator instance
-     """
- return ETLOrchestrator(contract_dir=contract_dir, **kwargs)
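Putting the factory and `run_multiple` together, end to end. A sketch only: the import path, contract directory, and symbols are placeholders, and `dry_run=True` keeps the example away from a real database:

    import asyncio
    from pycharter.etl_generator import create_orchestrator  # import path assumed

    async def main():
        orchestrator = create_orchestrator(contract_dir="contracts/prices")  # placeholder dir
        results = await orchestrator.run_multiple(
            param_name="symbol",
            param_values=["AAPL", "MSFT"],
            dry_run=True,
        )
        for r in results:
            print(r["params"], r["success"])

    asyncio.run(main())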