pycharter 0.0.22__py3-none-any.whl → 0.0.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332) hide show
  1. api/routes/v1/templates.py +43 -24
  2. pycharter/data/templates/etl/README.md +91 -0
  3. pycharter/data/templates/etl/extract_cloud_azure.yaml +23 -0
  4. pycharter/data/templates/etl/extract_cloud_gcs.yaml +22 -0
  5. pycharter/data/templates/etl/extract_cloud_s3.yaml +24 -0
  6. pycharter/data/templates/etl/extract_database.yaml +28 -0
  7. pycharter/data/templates/etl/extract_database_ssh.yaml +27 -0
  8. pycharter/data/templates/etl/extract_file_csv.yaml +17 -0
  9. pycharter/data/templates/etl/extract_file_glob.yaml +17 -0
  10. pycharter/data/templates/etl/extract_file_json.yaml +14 -0
  11. pycharter/data/templates/etl/extract_file_parquet.yaml +13 -0
  12. pycharter/data/templates/etl/extract_http_paginated.yaml +75 -0
  13. pycharter/data/templates/etl/extract_http_path_params.yaml +45 -0
  14. pycharter/data/templates/etl/extract_http_simple.yaml +52 -0
  15. pycharter/data/templates/etl/load_insert.yaml +17 -0
  16. pycharter/data/templates/etl/load_postgresql.yaml +17 -0
  17. pycharter/data/templates/etl/load_sqlite.yaml +16 -0
  18. pycharter/data/templates/etl/load_truncate_and_load.yaml +18 -0
  19. pycharter/data/templates/etl/load_upsert.yaml +28 -0
  20. pycharter/data/templates/etl/load_with_dlq.yaml +24 -0
  21. pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +28 -0
  22. pycharter/data/templates/etl/pipeline_http_to_db.yaml +38 -0
  23. pycharter/data/templates/etl/transform_combined.yaml +38 -0
  24. pycharter/data/templates/etl/transform_custom_function.yaml +18 -0
  25. pycharter/data/templates/etl/transform_jsonata.yaml +20 -0
  26. pycharter/data/templates/etl/transform_simple.yaml +41 -0
  27. pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
  28. pycharter/etl_generator/extraction.py +47 -262
  29. pycharter/etl_generator/extractors/__init__.py +26 -0
  30. pycharter/etl_generator/extractors/base.py +70 -0
  31. pycharter/etl_generator/extractors/cloud_storage.py +454 -0
  32. pycharter/etl_generator/extractors/database.py +151 -0
  33. pycharter/etl_generator/extractors/factory.py +141 -0
  34. pycharter/etl_generator/extractors/file.py +418 -0
  35. pycharter/etl_generator/extractors/http.py +816 -0
  36. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/METADATA +6 -1
  37. pycharter-0.0.23.dist-info/RECORD +498 -0
  38. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/WHEEL +1 -1
  39. ui/static/404/index.html +1 -1
  40. ui/static/404.html +1 -1
  41. ui/static/__next.__PAGE__.txt +1 -1
  42. ui/static/__next._full.txt +1 -1
  43. ui/static/__next._head.txt +1 -1
  44. ui/static/__next._index.txt +1 -1
  45. ui/static/__next._tree.txt +1 -1
  46. ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
  47. ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
  48. ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
  49. ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
  50. ui/static/_not-found/__next._full.txt +1 -1
  51. ui/static/_not-found/__next._head.txt +1 -1
  52. ui/static/_not-found/__next._index.txt +1 -1
  53. ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  54. ui/static/_not-found/__next._not-found.txt +1 -1
  55. ui/static/_not-found/__next._tree.txt +1 -1
  56. ui/static/_not-found/index.html +1 -1
  57. ui/static/_not-found/index.txt +1 -1
  58. ui/static/contracts/__next._full.txt +2 -2
  59. ui/static/contracts/__next._head.txt +1 -1
  60. ui/static/contracts/__next._index.txt +1 -1
  61. ui/static/contracts/__next._tree.txt +1 -1
  62. ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  63. ui/static/contracts/__next.contracts.txt +1 -1
  64. ui/static/contracts/index.html +1 -1
  65. ui/static/contracts/index.txt +2 -2
  66. ui/static/documentation/__next._full.txt +1 -1
  67. ui/static/documentation/__next._head.txt +1 -1
  68. ui/static/documentation/__next._index.txt +1 -1
  69. ui/static/documentation/__next._tree.txt +1 -1
  70. ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
  71. ui/static/documentation/__next.documentation.txt +1 -1
  72. ui/static/documentation/index.html +2 -2
  73. ui/static/documentation/index.txt +1 -1
  74. ui/static/index.html +1 -1
  75. ui/static/index.txt +1 -1
  76. ui/static/metadata/__next._full.txt +1 -1
  77. ui/static/metadata/__next._head.txt +1 -1
  78. ui/static/metadata/__next._index.txt +1 -1
  79. ui/static/metadata/__next._tree.txt +1 -1
  80. ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  81. ui/static/metadata/__next.metadata.txt +1 -1
  82. ui/static/metadata/index.html +1 -1
  83. ui/static/metadata/index.txt +1 -1
  84. ui/static/quality/__next._full.txt +2 -2
  85. ui/static/quality/__next._head.txt +1 -1
  86. ui/static/quality/__next._index.txt +1 -1
  87. ui/static/quality/__next._tree.txt +1 -1
  88. ui/static/quality/__next.quality.__PAGE__.txt +2 -2
  89. ui/static/quality/__next.quality.txt +1 -1
  90. ui/static/quality/index.html +2 -2
  91. ui/static/quality/index.txt +2 -2
  92. ui/static/rules/__next._full.txt +1 -1
  93. ui/static/rules/__next._head.txt +1 -1
  94. ui/static/rules/__next._index.txt +1 -1
  95. ui/static/rules/__next._tree.txt +1 -1
  96. ui/static/rules/__next.rules.__PAGE__.txt +1 -1
  97. ui/static/rules/__next.rules.txt +1 -1
  98. ui/static/rules/index.html +1 -1
  99. ui/static/rules/index.txt +1 -1
  100. ui/static/schemas/__next._full.txt +1 -1
  101. ui/static/schemas/__next._head.txt +1 -1
  102. ui/static/schemas/__next._index.txt +1 -1
  103. ui/static/schemas/__next._tree.txt +1 -1
  104. ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  105. ui/static/schemas/__next.schemas.txt +1 -1
  106. ui/static/schemas/index.html +1 -1
  107. ui/static/schemas/index.txt +1 -1
  108. ui/static/settings/__next._full.txt +1 -1
  109. ui/static/settings/__next._head.txt +1 -1
  110. ui/static/settings/__next._index.txt +1 -1
  111. ui/static/settings/__next._tree.txt +1 -1
  112. ui/static/settings/__next.settings.__PAGE__.txt +1 -1
  113. ui/static/settings/__next.settings.txt +1 -1
  114. ui/static/settings/index.html +1 -1
  115. ui/static/settings/index.txt +1 -1
  116. ui/static/static/404/index.html +1 -1
  117. ui/static/static/404.html +1 -1
  118. ui/static/static/__next.__PAGE__.txt +1 -1
  119. ui/static/static/__next._full.txt +2 -2
  120. ui/static/static/__next._head.txt +1 -1
  121. ui/static/static/__next._index.txt +2 -2
  122. ui/static/static/__next._tree.txt +2 -2
  123. ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
  124. ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
  125. ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
  126. ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
  127. ui/static/static/_not-found/__next._full.txt +2 -2
  128. ui/static/static/_not-found/__next._head.txt +1 -1
  129. ui/static/static/_not-found/__next._index.txt +2 -2
  130. ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  131. ui/static/static/_not-found/__next._not-found.txt +1 -1
  132. ui/static/static/_not-found/__next._tree.txt +2 -2
  133. ui/static/static/_not-found/index.html +1 -1
  134. ui/static/static/_not-found/index.txt +2 -2
  135. ui/static/static/contracts/__next._full.txt +3 -3
  136. ui/static/static/contracts/__next._head.txt +1 -1
  137. ui/static/static/contracts/__next._index.txt +2 -2
  138. ui/static/static/contracts/__next._tree.txt +2 -2
  139. ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  140. ui/static/static/contracts/__next.contracts.txt +1 -1
  141. ui/static/static/contracts/index.html +1 -1
  142. ui/static/static/contracts/index.txt +3 -3
  143. ui/static/static/documentation/__next._full.txt +3 -3
  144. ui/static/static/documentation/__next._head.txt +1 -1
  145. ui/static/static/documentation/__next._index.txt +2 -2
  146. ui/static/static/documentation/__next._tree.txt +2 -2
  147. ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
  148. ui/static/static/documentation/__next.documentation.txt +1 -1
  149. ui/static/static/documentation/index.html +2 -2
  150. ui/static/static/documentation/index.txt +3 -3
  151. ui/static/static/index.html +1 -1
  152. ui/static/static/index.txt +2 -2
  153. ui/static/static/metadata/__next._full.txt +2 -2
  154. ui/static/static/metadata/__next._head.txt +1 -1
  155. ui/static/static/metadata/__next._index.txt +2 -2
  156. ui/static/static/metadata/__next._tree.txt +2 -2
  157. ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  158. ui/static/static/metadata/__next.metadata.txt +1 -1
  159. ui/static/static/metadata/index.html +1 -1
  160. ui/static/static/metadata/index.txt +2 -2
  161. ui/static/static/quality/__next._full.txt +2 -2
  162. ui/static/static/quality/__next._head.txt +1 -1
  163. ui/static/static/quality/__next._index.txt +2 -2
  164. ui/static/static/quality/__next._tree.txt +2 -2
  165. ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
  166. ui/static/static/quality/__next.quality.txt +1 -1
  167. ui/static/static/quality/index.html +2 -2
  168. ui/static/static/quality/index.txt +2 -2
  169. ui/static/static/rules/__next._full.txt +2 -2
  170. ui/static/static/rules/__next._head.txt +1 -1
  171. ui/static/static/rules/__next._index.txt +2 -2
  172. ui/static/static/rules/__next._tree.txt +2 -2
  173. ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
  174. ui/static/static/rules/__next.rules.txt +1 -1
  175. ui/static/static/rules/index.html +1 -1
  176. ui/static/static/rules/index.txt +2 -2
  177. ui/static/static/schemas/__next._full.txt +2 -2
  178. ui/static/static/schemas/__next._head.txt +1 -1
  179. ui/static/static/schemas/__next._index.txt +2 -2
  180. ui/static/static/schemas/__next._tree.txt +2 -2
  181. ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  182. ui/static/static/schemas/__next.schemas.txt +1 -1
  183. ui/static/static/schemas/index.html +1 -1
  184. ui/static/static/schemas/index.txt +2 -2
  185. ui/static/static/settings/__next._full.txt +2 -2
  186. ui/static/static/settings/__next._head.txt +1 -1
  187. ui/static/static/settings/__next._index.txt +2 -2
  188. ui/static/static/settings/__next._tree.txt +2 -2
  189. ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
  190. ui/static/static/settings/__next.settings.txt +1 -1
  191. ui/static/static/settings/index.html +1 -1
  192. ui/static/static/settings/index.txt +2 -2
  193. ui/static/static/static/.gitkeep +0 -0
  194. ui/static/static/static/404/index.html +1 -0
  195. ui/static/static/static/404.html +1 -0
  196. ui/static/static/static/__next.__PAGE__.txt +10 -0
  197. ui/static/static/static/__next._full.txt +30 -0
  198. ui/static/static/static/__next._head.txt +7 -0
  199. ui/static/static/static/__next._index.txt +9 -0
  200. ui/static/static/static/__next._tree.txt +2 -0
  201. ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
  202. ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
  203. ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
  204. ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
  205. ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
  206. ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
  207. ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
  208. ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
  209. ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
  210. ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
  211. ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
  212. ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
  213. ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
  214. ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
  215. ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
  216. ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
  217. ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
  218. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
  219. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
  220. ui/static/static/static/_not-found/__next._full.txt +17 -0
  221. ui/static/static/static/_not-found/__next._head.txt +7 -0
  222. ui/static/static/static/_not-found/__next._index.txt +9 -0
  223. ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
  224. ui/static/static/static/_not-found/__next._not-found.txt +4 -0
  225. ui/static/static/static/_not-found/__next._tree.txt +2 -0
  226. ui/static/static/static/_not-found/index.html +1 -0
  227. ui/static/static/static/_not-found/index.txt +17 -0
  228. ui/static/static/static/contracts/__next._full.txt +21 -0
  229. ui/static/static/static/contracts/__next._head.txt +7 -0
  230. ui/static/static/static/contracts/__next._index.txt +9 -0
  231. ui/static/static/static/contracts/__next._tree.txt +2 -0
  232. ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
  233. ui/static/static/static/contracts/__next.contracts.txt +4 -0
  234. ui/static/static/static/contracts/index.html +1 -0
  235. ui/static/static/static/contracts/index.txt +21 -0
  236. ui/static/static/static/documentation/__next._full.txt +21 -0
  237. ui/static/static/static/documentation/__next._head.txt +7 -0
  238. ui/static/static/static/documentation/__next._index.txt +9 -0
  239. ui/static/static/static/documentation/__next._tree.txt +2 -0
  240. ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
  241. ui/static/static/static/documentation/__next.documentation.txt +4 -0
  242. ui/static/static/static/documentation/index.html +93 -0
  243. ui/static/static/static/documentation/index.txt +21 -0
  244. ui/static/static/static/index.html +1 -0
  245. ui/static/static/static/index.txt +30 -0
  246. ui/static/static/static/metadata/__next._full.txt +21 -0
  247. ui/static/static/static/metadata/__next._head.txt +7 -0
  248. ui/static/static/static/metadata/__next._index.txt +9 -0
  249. ui/static/static/static/metadata/__next._tree.txt +2 -0
  250. ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
  251. ui/static/static/static/metadata/__next.metadata.txt +4 -0
  252. ui/static/static/static/metadata/index.html +1 -0
  253. ui/static/static/static/metadata/index.txt +21 -0
  254. ui/static/static/static/quality/__next._full.txt +21 -0
  255. ui/static/static/static/quality/__next._head.txt +7 -0
  256. ui/static/static/static/quality/__next._index.txt +9 -0
  257. ui/static/static/static/quality/__next._tree.txt +2 -0
  258. ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
  259. ui/static/static/static/quality/__next.quality.txt +4 -0
  260. ui/static/static/static/quality/index.html +2 -0
  261. ui/static/static/static/quality/index.txt +21 -0
  262. ui/static/static/static/rules/__next._full.txt +21 -0
  263. ui/static/static/static/rules/__next._head.txt +7 -0
  264. ui/static/static/static/rules/__next._index.txt +9 -0
  265. ui/static/static/static/rules/__next._tree.txt +2 -0
  266. ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
  267. ui/static/static/static/rules/__next.rules.txt +4 -0
  268. ui/static/static/static/rules/index.html +1 -0
  269. ui/static/static/static/rules/index.txt +21 -0
  270. ui/static/static/static/schemas/__next._full.txt +21 -0
  271. ui/static/static/static/schemas/__next._head.txt +7 -0
  272. ui/static/static/static/schemas/__next._index.txt +9 -0
  273. ui/static/static/static/schemas/__next._tree.txt +2 -0
  274. ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
  275. ui/static/static/static/schemas/__next.schemas.txt +4 -0
  276. ui/static/static/static/schemas/index.html +1 -0
  277. ui/static/static/static/schemas/index.txt +21 -0
  278. ui/static/static/static/settings/__next._full.txt +21 -0
  279. ui/static/static/static/settings/__next._head.txt +7 -0
  280. ui/static/static/static/settings/__next._index.txt +9 -0
  281. ui/static/static/static/settings/__next._tree.txt +2 -0
  282. ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
  283. ui/static/static/static/settings/__next.settings.txt +4 -0
  284. ui/static/static/static/settings/index.html +1 -0
  285. ui/static/static/static/settings/index.txt +21 -0
  286. ui/static/static/static/validation/__next._full.txt +21 -0
  287. ui/static/static/static/validation/__next._head.txt +7 -0
  288. ui/static/static/static/validation/__next._index.txt +9 -0
  289. ui/static/static/static/validation/__next._tree.txt +2 -0
  290. ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
  291. ui/static/static/static/validation/__next.validation.txt +4 -0
  292. ui/static/static/static/validation/index.html +1 -0
  293. ui/static/static/static/validation/index.txt +21 -0
  294. ui/static/static/validation/__next._full.txt +2 -2
  295. ui/static/static/validation/__next._head.txt +1 -1
  296. ui/static/static/validation/__next._index.txt +2 -2
  297. ui/static/static/validation/__next._tree.txt +2 -2
  298. ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
  299. ui/static/static/validation/__next.validation.txt +1 -1
  300. ui/static/static/validation/index.html +1 -1
  301. ui/static/static/validation/index.txt +2 -2
  302. ui/static/validation/__next._full.txt +2 -2
  303. ui/static/validation/__next._head.txt +1 -1
  304. ui/static/validation/__next._index.txt +1 -1
  305. ui/static/validation/__next._tree.txt +1 -1
  306. ui/static/validation/__next.validation.__PAGE__.txt +2 -2
  307. ui/static/validation/__next.validation.txt +1 -1
  308. ui/static/validation/index.html +1 -1
  309. ui/static/validation/index.txt +2 -2
  310. pycharter/data/templates/template_transform_advanced.yaml +0 -50
  311. pycharter/data/templates/template_transform_simple.yaml +0 -59
  312. pycharter-0.0.22.dist-info/RECORD +0 -358
  313. /pycharter/data/templates/{template_coercion_rules.yaml → contract/template_coercion_rules.yaml} +0 -0
  314. /pycharter/data/templates/{template_contract.yaml → contract/template_contract.yaml} +0 -0
  315. /pycharter/data/templates/{template_metadata.yaml → contract/template_metadata.yaml} +0 -0
  316. /pycharter/data/templates/{template_schema.yaml → contract/template_schema.yaml} +0 -0
  317. /pycharter/data/templates/{template_validation_rules.yaml → contract/template_validation_rules.yaml} +0 -0
  318. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/entry_points.txt +0 -0
  319. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/licenses/LICENSE +0 -0
  320. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/top_level.txt +0 -0
  321. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
  322. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
  323. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
  324. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
  325. /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
  326. /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
  327. /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
  328. /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
  329. /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
  330. /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
  331. /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
  332. /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
@@ -0,0 +1,454 @@
1
+ """
2
+ Cloud storage extractor for ETL orchestrator.
3
+
4
+ Supports extracting data from cloud storage:
5
+ - AWS S3
6
+ - Google Cloud Storage (GCS)
7
+ - Azure Blob Storage
8
+ """
9
+
10
+ import logging
11
+ import os
12
+ import tempfile
13
+ from pathlib import Path
14
+ from typing import Any, AsyncIterator, Dict, List, Optional
15
+
16
+ from pycharter.etl_generator.extractors.base import BaseExtractor
17
+ from pycharter.etl_generator.extractors.file import FileExtractor
18
+ from pycharter.utils.value_injector import resolve_values
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Try to import cloud storage libraries
23
+ try:
24
+ import boto3
25
+ from botocore.exceptions import ClientError
26
+ S3_AVAILABLE = True
27
+ except ImportError:
28
+ S3_AVAILABLE = False
29
+ boto3 = None
30
+ ClientError = None
31
+
32
+ try:
33
+ from google.cloud import storage as gcs_storage
34
+ GCS_AVAILABLE = True
35
+ except ImportError:
36
+ GCS_AVAILABLE = False
37
+ gcs_storage = None
38
+
39
+ try:
40
+ from azure.storage.blob import BlobServiceClient
41
+ AZURE_AVAILABLE = True
42
+ except ImportError:
43
+ AZURE_AVAILABLE = False
44
+ BlobServiceClient = None
45
+
46
+
47
class CloudStorageExtractor(BaseExtractor):
    """Extractor for cloud storage data sources.

    Objects are downloaded from AWS S3, Google Cloud Storage or Azure Blob
    Storage into local temporary files and then parsed by ``FileExtractor``.
    ``storage.path`` may name a single object, a "directory" (trailing ``/``)
    or a glob pattern containing ``*``.

    NOTE: downloads use the providers' synchronous SDK calls, so they block
    the event loop while a file is being fetched.
    """

    def validate_config(self, extract_config: Dict[str, Any]) -> None:
        """Validate cloud storage extractor configuration.

        Args:
            extract_config: Extraction configuration; must contain a
                ``storage`` mapping with ``provider``, ``bucket`` and ``path``.

        Raises:
            ValueError: If ``source_type`` is present but not
                ``'cloud_storage'``, if the provider is not one of
                ``s3``/``gcs``/``azure``, or if bucket/path is missing.
        """
        if 'source_type' in extract_config and extract_config['source_type'] != 'cloud_storage':
            raise ValueError(
                f"CloudStorageExtractor requires source_type='cloud_storage', "
                f"got '{extract_config.get('source_type')}'"
            )

        # ``or`` guards against an explicit ``storage: null`` / ``provider: null``.
        storage_config = extract_config.get('storage') or {}
        provider = str(storage_config.get('provider') or '').lower()

        if provider not in ['s3', 'gcs', 'azure']:
            raise ValueError(
                f"Cloud storage provider must be 's3', 'gcs', or 'azure', got '{provider}'"
            )

        if not storage_config.get('bucket'):
            raise ValueError("Cloud storage extractor requires 'storage.bucket' in extract_config")

        if not storage_config.get('path'):
            raise ValueError("Cloud storage extractor requires 'storage.path' in extract_config")

    async def extract_streaming(
        self,
        extract_config: Dict[str, Any],
        params: Dict[str, Any],
        headers: Dict[str, Any],
        contract_dir: Optional[Any] = None,
        batch_size: int = 1000,
        max_records: Optional[int] = None,
        config_context: Optional[Dict[str, Any]] = None,
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Extract data from cloud storage.

        Downloads files from cloud storage and processes them using
        FileExtractor. Supports single files and prefixes (for multiple files).

        Args:
            extract_config: Extraction configuration with a ``storage`` mapping
                (``provider``, ``bucket``, ``path``, optional ``credentials``)
                and an optional ``format``.
            params: Unused by this extractor.
            headers: Unused by this extractor.
            contract_dir: Directory holding ``extract.yaml``; only used to
                build the ``source_file`` passed to ``resolve_values``.
            batch_size: Batch size forwarded to ``FileExtractor``.
            max_records: Maximum number of records to yield in total; a falsy
                value means no limit.
            config_context: Context forwarded to ``resolve_values`` and
                ``FileExtractor``.

        Yields:
            Batches (lists) of record dictionaries.

        Raises:
            ValueError: If the provider is not supported.
            ImportError: If the SDK for the selected provider is not installed.
        """
        storage_config = extract_config.get('storage') or {}
        provider = str(storage_config.get('provider') or '').lower()

        # Resolve variables
        source_file = str(Path(contract_dir) / "extract.yaml") if contract_dir else None
        bucket = resolve_values(storage_config.get('bucket'), context=config_context, source_file=source_file)
        path = resolve_values(storage_config.get('path'), context=config_context, source_file=source_file)
        # NOTE(review): credentials are passed through without variable
        # resolution, as before - confirm whether they should be resolved too.
        credentials = storage_config.get('credentials')

        # Use the explicit format, else try to detect it from the path.
        file_format = extract_config.get('format')
        if not file_format:
            file_format = self._detect_format_from_path(Path(path))

        logger.info(f"Extracting from {provider.upper()}: {bucket}/{path}")

        # Download and process files
        if provider == 's3':
            provider_stream = self._extract_from_s3
        elif provider == 'gcs':
            provider_stream = self._extract_from_gcs
        elif provider == 'azure':
            provider_stream = self._extract_from_azure
        else:
            raise ValueError(f"Unsupported cloud storage provider: {provider}")

        async for batch in provider_stream(
            bucket, path, credentials, file_format, batch_size, max_records, config_context, source_file
        ):
            yield batch

    async def _extract_from_s3(
        self,
        bucket: str,
        path: str,
        credentials: Optional[Dict[str, Any]],
        file_format: Optional[str],
        batch_size: int,
        max_records: Optional[int],
        config_context: Optional[Dict[str, Any]],
        source_file: Optional[str],
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Extract data from AWS S3.

        Explicit keys are used only when ``credentials`` is a dict holding both
        ``aws_access_key_id`` and ``aws_secret_access_key`` (region defaults to
        ``us-east-1``); otherwise boto3's default credential chain applies.

        Raises:
            ImportError: If boto3 is not installed.
        """
        if not S3_AVAILABLE:
            raise ImportError(
                "boto3 is required for S3 extraction. "
                "Install with: pip install boto3 or pip install pycharter[etl]"
            )

        # Build the client once instead of creating a default one and
        # replacing it when explicit keys are supplied.
        client_kwargs: Dict[str, Any] = {}
        if isinstance(credentials, dict):
            aws_access_key_id = credentials.get('aws_access_key_id')
            aws_secret_access_key = credentials.get('aws_secret_access_key')
            if aws_access_key_id and aws_secret_access_key:
                client_kwargs = {
                    'aws_access_key_id': aws_access_key_id,
                    'aws_secret_access_key': aws_secret_access_key,
                    'region_name': credentials.get('region', 'us-east-1'),
                }
        s3_client = boto3.client('s3', **client_kwargs)

        def list_keys():
            # Lazy generator: pagination stops as soon as the consumer stops.
            paginator = s3_client.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket, Prefix=self._listing_prefix(path)):
                for obj in page.get('Contents', []):
                    yield obj['Key']

        def download(key: str, destination: Path) -> None:
            # Closing the handle flushes the data before it is read back.
            with destination.open('wb') as handle:
                s3_client.download_fileobj(bucket, key, handle)

        names = list_keys() if self._is_multi_object_path(path) else [path]
        async for batch in self._stream_objects(
            names, download, "S3 object", bucket, path,
            file_format, batch_size, max_records, config_context,
        ):
            yield batch

    async def _extract_from_gcs(
        self,
        bucket: str,
        path: str,
        credentials: Optional[Dict[str, Any]],
        file_format: Optional[str],
        batch_size: int,
        max_records: Optional[int],
        config_context: Optional[Dict[str, Any]],
        source_file: Optional[str],
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Extract data from Google Cloud Storage.

        ``credentials`` may be a path to a service-account JSON key file or a
        dict with the key's contents; anything else falls back to the default
        client.

        Raises:
            ImportError: If google-cloud-storage is not installed.
        """
        if not GCS_AVAILABLE:
            raise ImportError(
                "google-cloud-storage is required for GCS extraction. "
                "Install with: pip install google-cloud-storage"
            )

        # Initialize GCS client
        if isinstance(credentials, str) and credentials:
            client = gcs_storage.Client.from_service_account_json(credentials)
        elif isinstance(credentials, dict) and credentials:
            import json

            # The key material is written to a temporary JSON file that is
            # removed again even if serialisation or client creation fails.
            key_file = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
            try:
                with key_file:
                    json.dump(credentials, key_file)
                client = gcs_storage.Client.from_service_account_json(key_file.name)
            finally:
                Path(key_file.name).unlink()
        else:
            client = gcs_storage.Client()

        bucket_obj = client.bucket(bucket)

        def list_names():
            for blob in bucket_obj.list_blobs(prefix=self._listing_prefix(path)):
                yield blob.name

        def download(name: str, destination: Path) -> None:
            bucket_obj.blob(name).download_to_filename(str(destination))

        names = list_names() if self._is_multi_object_path(path) else [path]
        async for batch in self._stream_objects(
            names, download, "GCS blob", bucket, path,
            file_format, batch_size, max_records, config_context,
        ):
            yield batch

    async def _extract_from_azure(
        self,
        container: str,
        path: str,
        credentials: Optional[Dict[str, Any]],
        file_format: Optional[str],
        batch_size: int,
        max_records: Optional[int],
        config_context: Optional[Dict[str, Any]],
        source_file: Optional[str],
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Extract data from Azure Blob Storage.

        Without ``credentials`` the connection string is read from the
        ``AZURE_STORAGE_CONNECTION_STRING`` environment variable.

        Raises:
            ImportError: If azure-storage-blob is not installed.
            ValueError: If no usable credentials can be determined.
        """
        if not AZURE_AVAILABLE:
            raise ImportError(
                "azure-storage-blob is required for Azure extraction. "
                "Install with: pip install azure-storage-blob"
            )

        # Initialize Azure client
        if credentials:
            connection_string = credentials.get('connection_string')
            account_name = credentials.get('account_name')
            account_key = credentials.get('account_key')

            if connection_string:
                blob_service_client = BlobServiceClient.from_connection_string(connection_string)
            elif account_name and account_key:
                account_url = f"https://{account_name}.blob.core.windows.net"
                blob_service_client = BlobServiceClient(account_url, credential=account_key)
            else:
                raise ValueError("Azure credentials must include 'connection_string' or ('account_name', 'account_key')")
        else:
            # Use default credentials (environment variables). Fail with a
            # clear message instead of handing the SDK an empty string.
            connection_string = os.environ.get('AZURE_STORAGE_CONNECTION_STRING', '')
            if not connection_string:
                raise ValueError(
                    "Azure extraction requires 'storage.credentials' or the "
                    "AZURE_STORAGE_CONNECTION_STRING environment variable"
                )
            blob_service_client = BlobServiceClient.from_connection_string(connection_string)

        container_client = blob_service_client.get_container_client(container)

        def list_names():
            for blob in container_client.list_blobs(name_starts_with=self._listing_prefix(path)):
                yield blob.name

        def download(name: str, destination: Path) -> None:
            blob_client = container_client.get_blob_client(name)
            # Closing the handle flushes the data before it is read back.
            with destination.open('wb') as handle:
                blob_client.download_blob().download_to_stream(handle)

        names = list_names() if self._is_multi_object_path(path) else [path]
        async for batch in self._stream_objects(
            names, download, "Azure blob", container, path,
            file_format, batch_size, max_records, config_context,
        ):
            yield batch

    async def _stream_objects(
        self,
        names: Any,
        download: Any,
        label: str,
        location: str,
        pattern: str,
        file_format: Optional[str],
        batch_size: int,
        max_records: Optional[int],
        config_context: Optional[Dict[str, Any]],
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Download each object to a temp file and stream its records.

        Args:
            names: Iterable of object names (keys); may be a lazy generator.
            download: Callable ``(name, destination_path)`` that writes the
                object to ``destination_path`` and closes it before returning.
            label: Human-readable object kind used in log messages.
            location: Bucket/container name used in log messages.
            pattern: The configured ``storage.path``; used to filter names.
            file_format: Format forwarded to ``FileExtractor`` (may be None).
            batch_size: Batch size forwarded to ``FileExtractor``.
            max_records: Overall record limit; falsy means unlimited.
            config_context: Context forwarded to ``FileExtractor``.

        Yields:
            Batches of records; the total never exceeds ``max_records``.
        """
        limit = max_records if max_records else None
        total_extracted = 0

        for name in names:
            if limit is not None and total_extracted >= limit:
                return
            if not self._key_selected(pattern, name):
                continue

            logger.info(f"Processing {label}: {location}/{name}")

            # Only the remaining budget is handed to the file reader so the
            # overall limit holds across several objects.
            per_file_limit = max_records if limit is None else limit - total_extracted

            # The temp path exists before the download starts, so cleanup in
            # ``finally`` is always well-defined.
            tmp_path = self._make_temp_path(name)
            stream = None
            try:
                download(name, tmp_path)

                file_config = {
                    'source_type': 'file',
                    'file_path': str(tmp_path),
                    'format': file_format,
                }
                stream = FileExtractor().extract_streaming(
                    file_config, {}, {}, None, batch_size, per_file_limit, config_context
                )
                async for batch in stream:
                    if limit is not None:
                        # Defensive trim in case the reader overshoots.
                        batch = batch[: limit - total_extracted]
                    total_extracted += len(batch)
                    yield batch
                    if limit is not None and total_extracted >= limit:
                        break
            finally:
                try:
                    # Close a partially consumed reader so it releases the
                    # temp file before the file is deleted.
                    aclose = getattr(stream, 'aclose', None)
                    if aclose is not None:
                        await aclose()
                finally:
                    if tmp_path.exists():
                        tmp_path.unlink()

    @staticmethod
    def _is_multi_object_path(path: str) -> bool:
        """Return True if *path* is a prefix (ends with ``/``) or a glob (contains ``*``)."""
        return path.endswith('/') or '*' in path

    @staticmethod
    def _listing_prefix(path: str) -> str:
        """Return the prefix used to list candidate objects for *path*.

        For a glob this is everything before the first ``*``. For a directory
        the trailing slash is kept so that ``data/`` does not also match
        sibling prefixes such as ``data_old/``.
        """
        if '*' in path:
            return path.split('*')[0]
        stripped = path.rstrip('/')
        return f"{stripped}/" if stripped else ''

    @staticmethod
    def _key_selected(pattern: str, name: str) -> bool:
        """Return True if object *name* should be extracted for *pattern*.

        Zero-byte "folder" placeholders (names ending in ``/``) are skipped,
        and glob patterns are applied to the full object name.
        """
        if name.endswith('/'):
            return False
        if '*' in pattern:
            import fnmatch

            return fnmatch.fnmatchcase(name, pattern.rstrip('/'))
        return True

    @staticmethod
    def _make_temp_path(name: str) -> Path:
        """Create an empty, closed temp file that keeps *name*'s extension."""
        fd, tmp_name = tempfile.mkstemp(suffix=Path(name).suffix)
        os.close(fd)
        return Path(tmp_name)

    def _detect_format_from_path(self, path: Path) -> Optional[str]:
        """Detect file format from path extension.

        Returns:
            The format name for a known extension (case-insensitive), else None.
        """
        suffix = path.suffix.lower()
        format_map = {
            '.csv': 'csv',
            '.tsv': 'tsv',
            '.json': 'json',
            '.jsonl': 'jsonl',
            '.ndjson': 'jsonl',
            '.parquet': 'parquet',
            '.xlsx': 'excel',
            '.xls': 'excel',
            '.xml': 'xml',
        }
        return format_map.get(suffix)
@@ -0,0 +1,151 @@
1
+ """
2
+ Database extractor for ETL orchestrator.
3
+
4
+ Supports extracting data from databases:
5
+ - PostgreSQL
6
+ - MySQL
7
+ - SQLite
8
+ - MSSQL
9
+ - Oracle
10
+ """
11
+
12
+ import logging
13
+ from typing import Any, AsyncIterator, Dict, List, Optional
14
+
15
+ from sqlalchemy import create_engine, text
16
+ from sqlalchemy.orm import Session, sessionmaker
17
+
18
+ from pycharter.etl_generator.database import (
19
+ create_ssh_tunnel,
20
+ detect_database_type,
21
+ modify_url_for_tunnel,
22
+ DEFAULT_TUNNEL_LOCAL_PORT,
23
+ )
24
+ from pycharter.etl_generator.extractors.base import BaseExtractor
25
+ from pycharter.utils.value_injector import resolve_values
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class DatabaseExtractor(BaseExtractor):
    """Extractor that reads records from a SQL database through SQLAlchemy.

    The extract configuration is read for a ``database`` mapping (``url``,
    optional ``type`` and ``ssh_tunnel``), a ``query`` string and optional
    ``query_params``.
    """

    # String spellings of ``ssh_tunnel.enabled`` that count as "on".
    _TRUTHY_STRINGS = ('true', '1', 'yes', 'on')

    def validate_config(self, extract_config: Dict[str, Any]) -> None:
        """Validate database extractor configuration.

        Args:
            extract_config: The extract section of the pipeline configuration.

        Raises:
            ValueError: If ``source_type`` is present and is not
                ``'database'``, or if ``database.url`` or ``query`` is
                missing or empty.
        """
        if 'source_type' in extract_config and extract_config['source_type'] != 'database':
            raise ValueError(f"DatabaseExtractor requires source_type='database', got '{extract_config.get('source_type')}'")

        # ``or {}`` also covers an explicit ``database: null`` entry.
        db_config = extract_config.get('database') or {}
        if not db_config.get('url'):
            raise ValueError("Database extractor requires 'database.url' in extract_config")

        if not extract_config.get('query'):
            raise ValueError("Database extractor requires 'query' in extract_config")

    @classmethod
    def _coerce_enabled(cls, value: Any) -> bool:
        """Turn an ``enabled`` setting (bool, string or other) into a bool."""
        if isinstance(value, str):
            return value.lower() in cls._TRUTHY_STRINGS
        return bool(value)

    @staticmethod
    def _row_to_record(row: Any) -> Dict[str, Any]:
        """Convert one result row into a plain ``dict`` keyed by column name."""
        if hasattr(row, '_asdict'):
            # Named tuple-like row
            return row._asdict()
        if hasattr(row, '_mapping'):
            # Row mapping
            return dict(row._mapping)
        # Fallback: use column names
        return {col: getattr(row, col) for col in row.keys()}

    async def extract_streaming(
        self,
        extract_config: Dict[str, Any],
        params: Dict[str, Any],
        headers: Dict[str, Any],
        contract_dir: Optional[Any] = None,
        batch_size: int = 1000,
        max_records: Optional[int] = None,
        config_context: Optional[Dict[str, Any]] = None,
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """
        Extract data from database using SQL query.

        The query is executed once with bound parameters; result rows are
        converted to dicts and yielded in lists of up to ``batch_size``.

        Args:
            extract_config: Extract configuration holding ``database``,
                ``query`` and optional ``query_params``. It is not modified.
            params: Bound-parameter values; these override entries from
                ``extract_config['query_params']`` with the same name.
            headers: Unused by this extractor; accepted for interface parity.
            contract_dir: Directory used to build the ``extract.yaml`` path
                passed to ``resolve_values`` as ``source_file``.
            batch_size: Maximum number of records per yielded batch.
            max_records: Stop after this many records when truthy.
            config_context: Context passed to ``resolve_values``.

        Yields:
            Lists of record dicts.

        Raises:
            ValueError: If ``query`` or ``database.url`` is missing.
            RuntimeError: If executing the query or reading rows fails.
        """
        # Get database configuration
        db_config = extract_config.get('database') or {}
        query = extract_config.get('query')

        if not query:
            raise ValueError("Database extractor requires 'query' in extract_config")
        if not db_config.get('url'):
            # Same check and message as validate_config, so a missing URL
            # fails clearly here instead of deep inside engine creation.
            raise ValueError("Database extractor requires 'database.url' in extract_config")

        # Resolve variables
        source_file = str(contract_dir / "extract.yaml") if contract_dir else None
        db_url = resolve_values(db_config.get('url'), context=config_context, source_file=source_file)
        query = resolve_values(query, context=config_context, source_file=source_file)

        # Merge into a fresh dict so the caller's extract_config is never
        # mutated; params from kwargs override config params.
        query_params = {**(extract_config.get('query_params') or {}), **(params or {})}

        tunnel = None
        engine = None
        session = None
        try:
            # Handle SSH tunnel if configured. This happens inside the outer
            # try so the tunnel is stopped even when a later setup step fails.
            ssh_config = db_config.get('ssh_tunnel') or {}
            if ssh_config:
                # Copy before normalising 'enabled' so the configuration
                # mapping itself is left untouched.
                ssh_config = dict(
                    resolve_values(ssh_config, context=config_context, source_file=source_file)
                )
                ssh_config['enabled'] = self._coerce_enabled(ssh_config.get('enabled', False))

                if ssh_config['enabled']:
                    tunnel = create_ssh_tunnel(ssh_config)
                    if tunnel:
                        db_type_from_url = detect_database_type(db_url)
                        local_port = int(ssh_config.get('local_port', DEFAULT_TUNNEL_LOCAL_PORT))
                        db_url = modify_url_for_tunnel(db_url, local_port, db_type_from_url)

            # Detect database type (only used for logging below)
            db_type = db_config.get('type')
            if not db_type:
                db_type = detect_database_type(db_url)

            # Create engine and session
            engine = create_engine(db_url, echo=False)
            session = sessionmaker(bind=engine)()

            try:
                logger.info(f"Executing database query (db_type: {db_type})")
                logger.debug(f"Query: {query[:200]}...")

                result = session.execute(text(query), query_params)

                # Group rows into batches
                current_batch = []
                total_extracted = 0

                for row in result:
                    if max_records and total_extracted >= max_records:
                        break

                    current_batch.append(self._row_to_record(row))
                    total_extracted += 1

                    if len(current_batch) >= batch_size:
                        yield current_batch
                        current_batch = []

                # Yield remaining records
                if current_batch:
                    yield current_batch

                logger.info(f"Database extraction completed: {total_extracted} records extracted")

            except Exception as e:
                logger.error(f"Database extraction error: {e}", exc_info=True)
                raise RuntimeError(f"Database extraction failed: {e}") from e
        finally:
            # Release whatever was acquired, in reverse order of creation.
            if session is not None:
                session.close()
            if engine is not None:
                engine.dispose()
            if tunnel:
                tunnel.stop()