pycharter 0.0.22__py3-none-any.whl → 0.0.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332)
  1. api/routes/v1/templates.py +43 -24
  2. pycharter/data/templates/etl/README.md +91 -0
  3. pycharter/data/templates/etl/extract_cloud_azure.yaml +23 -0
  4. pycharter/data/templates/etl/extract_cloud_gcs.yaml +22 -0
  5. pycharter/data/templates/etl/extract_cloud_s3.yaml +24 -0
  6. pycharter/data/templates/etl/extract_database.yaml +28 -0
  7. pycharter/data/templates/etl/extract_database_ssh.yaml +27 -0
  8. pycharter/data/templates/etl/extract_file_csv.yaml +17 -0
  9. pycharter/data/templates/etl/extract_file_glob.yaml +17 -0
  10. pycharter/data/templates/etl/extract_file_json.yaml +14 -0
  11. pycharter/data/templates/etl/extract_file_parquet.yaml +13 -0
  12. pycharter/data/templates/etl/extract_http_paginated.yaml +75 -0
  13. pycharter/data/templates/etl/extract_http_path_params.yaml +45 -0
  14. pycharter/data/templates/etl/extract_http_simple.yaml +52 -0
  15. pycharter/data/templates/etl/load_insert.yaml +17 -0
  16. pycharter/data/templates/etl/load_postgresql.yaml +17 -0
  17. pycharter/data/templates/etl/load_sqlite.yaml +16 -0
  18. pycharter/data/templates/etl/load_truncate_and_load.yaml +18 -0
  19. pycharter/data/templates/etl/load_upsert.yaml +28 -0
  20. pycharter/data/templates/etl/load_with_dlq.yaml +24 -0
  21. pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +28 -0
  22. pycharter/data/templates/etl/pipeline_http_to_db.yaml +38 -0
  23. pycharter/data/templates/etl/transform_combined.yaml +38 -0
  24. pycharter/data/templates/etl/transform_custom_function.yaml +18 -0
  25. pycharter/data/templates/etl/transform_jsonata.yaml +20 -0
  26. pycharter/data/templates/etl/transform_simple.yaml +41 -0
  27. pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
  28. pycharter/etl_generator/extraction.py +47 -262
  29. pycharter/etl_generator/extractors/__init__.py +26 -0
  30. pycharter/etl_generator/extractors/base.py +70 -0
  31. pycharter/etl_generator/extractors/cloud_storage.py +454 -0
  32. pycharter/etl_generator/extractors/database.py +151 -0
  33. pycharter/etl_generator/extractors/factory.py +141 -0
  34. pycharter/etl_generator/extractors/file.py +418 -0
  35. pycharter/etl_generator/extractors/http.py +816 -0
  36. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/METADATA +6 -1
  37. pycharter-0.0.23.dist-info/RECORD +498 -0
  38. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/WHEEL +1 -1
  39. ui/static/404/index.html +1 -1
  40. ui/static/404.html +1 -1
  41. ui/static/__next.__PAGE__.txt +1 -1
  42. ui/static/__next._full.txt +1 -1
  43. ui/static/__next._head.txt +1 -1
  44. ui/static/__next._index.txt +1 -1
  45. ui/static/__next._tree.txt +1 -1
  46. ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
  47. ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
  48. ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
  49. ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
  50. ui/static/_not-found/__next._full.txt +1 -1
  51. ui/static/_not-found/__next._head.txt +1 -1
  52. ui/static/_not-found/__next._index.txt +1 -1
  53. ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  54. ui/static/_not-found/__next._not-found.txt +1 -1
  55. ui/static/_not-found/__next._tree.txt +1 -1
  56. ui/static/_not-found/index.html +1 -1
  57. ui/static/_not-found/index.txt +1 -1
  58. ui/static/contracts/__next._full.txt +2 -2
  59. ui/static/contracts/__next._head.txt +1 -1
  60. ui/static/contracts/__next._index.txt +1 -1
  61. ui/static/contracts/__next._tree.txt +1 -1
  62. ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  63. ui/static/contracts/__next.contracts.txt +1 -1
  64. ui/static/contracts/index.html +1 -1
  65. ui/static/contracts/index.txt +2 -2
  66. ui/static/documentation/__next._full.txt +1 -1
  67. ui/static/documentation/__next._head.txt +1 -1
  68. ui/static/documentation/__next._index.txt +1 -1
  69. ui/static/documentation/__next._tree.txt +1 -1
  70. ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
  71. ui/static/documentation/__next.documentation.txt +1 -1
  72. ui/static/documentation/index.html +2 -2
  73. ui/static/documentation/index.txt +1 -1
  74. ui/static/index.html +1 -1
  75. ui/static/index.txt +1 -1
  76. ui/static/metadata/__next._full.txt +1 -1
  77. ui/static/metadata/__next._head.txt +1 -1
  78. ui/static/metadata/__next._index.txt +1 -1
  79. ui/static/metadata/__next._tree.txt +1 -1
  80. ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  81. ui/static/metadata/__next.metadata.txt +1 -1
  82. ui/static/metadata/index.html +1 -1
  83. ui/static/metadata/index.txt +1 -1
  84. ui/static/quality/__next._full.txt +2 -2
  85. ui/static/quality/__next._head.txt +1 -1
  86. ui/static/quality/__next._index.txt +1 -1
  87. ui/static/quality/__next._tree.txt +1 -1
  88. ui/static/quality/__next.quality.__PAGE__.txt +2 -2
  89. ui/static/quality/__next.quality.txt +1 -1
  90. ui/static/quality/index.html +2 -2
  91. ui/static/quality/index.txt +2 -2
  92. ui/static/rules/__next._full.txt +1 -1
  93. ui/static/rules/__next._head.txt +1 -1
  94. ui/static/rules/__next._index.txt +1 -1
  95. ui/static/rules/__next._tree.txt +1 -1
  96. ui/static/rules/__next.rules.__PAGE__.txt +1 -1
  97. ui/static/rules/__next.rules.txt +1 -1
  98. ui/static/rules/index.html +1 -1
  99. ui/static/rules/index.txt +1 -1
  100. ui/static/schemas/__next._full.txt +1 -1
  101. ui/static/schemas/__next._head.txt +1 -1
  102. ui/static/schemas/__next._index.txt +1 -1
  103. ui/static/schemas/__next._tree.txt +1 -1
  104. ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  105. ui/static/schemas/__next.schemas.txt +1 -1
  106. ui/static/schemas/index.html +1 -1
  107. ui/static/schemas/index.txt +1 -1
  108. ui/static/settings/__next._full.txt +1 -1
  109. ui/static/settings/__next._head.txt +1 -1
  110. ui/static/settings/__next._index.txt +1 -1
  111. ui/static/settings/__next._tree.txt +1 -1
  112. ui/static/settings/__next.settings.__PAGE__.txt +1 -1
  113. ui/static/settings/__next.settings.txt +1 -1
  114. ui/static/settings/index.html +1 -1
  115. ui/static/settings/index.txt +1 -1
  116. ui/static/static/404/index.html +1 -1
  117. ui/static/static/404.html +1 -1
  118. ui/static/static/__next.__PAGE__.txt +1 -1
  119. ui/static/static/__next._full.txt +2 -2
  120. ui/static/static/__next._head.txt +1 -1
  121. ui/static/static/__next._index.txt +2 -2
  122. ui/static/static/__next._tree.txt +2 -2
  123. ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
  124. ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
  125. ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
  126. ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
  127. ui/static/static/_not-found/__next._full.txt +2 -2
  128. ui/static/static/_not-found/__next._head.txt +1 -1
  129. ui/static/static/_not-found/__next._index.txt +2 -2
  130. ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  131. ui/static/static/_not-found/__next._not-found.txt +1 -1
  132. ui/static/static/_not-found/__next._tree.txt +2 -2
  133. ui/static/static/_not-found/index.html +1 -1
  134. ui/static/static/_not-found/index.txt +2 -2
  135. ui/static/static/contracts/__next._full.txt +3 -3
  136. ui/static/static/contracts/__next._head.txt +1 -1
  137. ui/static/static/contracts/__next._index.txt +2 -2
  138. ui/static/static/contracts/__next._tree.txt +2 -2
  139. ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  140. ui/static/static/contracts/__next.contracts.txt +1 -1
  141. ui/static/static/contracts/index.html +1 -1
  142. ui/static/static/contracts/index.txt +3 -3
  143. ui/static/static/documentation/__next._full.txt +3 -3
  144. ui/static/static/documentation/__next._head.txt +1 -1
  145. ui/static/static/documentation/__next._index.txt +2 -2
  146. ui/static/static/documentation/__next._tree.txt +2 -2
  147. ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
  148. ui/static/static/documentation/__next.documentation.txt +1 -1
  149. ui/static/static/documentation/index.html +2 -2
  150. ui/static/static/documentation/index.txt +3 -3
  151. ui/static/static/index.html +1 -1
  152. ui/static/static/index.txt +2 -2
  153. ui/static/static/metadata/__next._full.txt +2 -2
  154. ui/static/static/metadata/__next._head.txt +1 -1
  155. ui/static/static/metadata/__next._index.txt +2 -2
  156. ui/static/static/metadata/__next._tree.txt +2 -2
  157. ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  158. ui/static/static/metadata/__next.metadata.txt +1 -1
  159. ui/static/static/metadata/index.html +1 -1
  160. ui/static/static/metadata/index.txt +2 -2
  161. ui/static/static/quality/__next._full.txt +2 -2
  162. ui/static/static/quality/__next._head.txt +1 -1
  163. ui/static/static/quality/__next._index.txt +2 -2
  164. ui/static/static/quality/__next._tree.txt +2 -2
  165. ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
  166. ui/static/static/quality/__next.quality.txt +1 -1
  167. ui/static/static/quality/index.html +2 -2
  168. ui/static/static/quality/index.txt +2 -2
  169. ui/static/static/rules/__next._full.txt +2 -2
  170. ui/static/static/rules/__next._head.txt +1 -1
  171. ui/static/static/rules/__next._index.txt +2 -2
  172. ui/static/static/rules/__next._tree.txt +2 -2
  173. ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
  174. ui/static/static/rules/__next.rules.txt +1 -1
  175. ui/static/static/rules/index.html +1 -1
  176. ui/static/static/rules/index.txt +2 -2
  177. ui/static/static/schemas/__next._full.txt +2 -2
  178. ui/static/static/schemas/__next._head.txt +1 -1
  179. ui/static/static/schemas/__next._index.txt +2 -2
  180. ui/static/static/schemas/__next._tree.txt +2 -2
  181. ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  182. ui/static/static/schemas/__next.schemas.txt +1 -1
  183. ui/static/static/schemas/index.html +1 -1
  184. ui/static/static/schemas/index.txt +2 -2
  185. ui/static/static/settings/__next._full.txt +2 -2
  186. ui/static/static/settings/__next._head.txt +1 -1
  187. ui/static/static/settings/__next._index.txt +2 -2
  188. ui/static/static/settings/__next._tree.txt +2 -2
  189. ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
  190. ui/static/static/settings/__next.settings.txt +1 -1
  191. ui/static/static/settings/index.html +1 -1
  192. ui/static/static/settings/index.txt +2 -2
  193. ui/static/static/static/.gitkeep +0 -0
  194. ui/static/static/static/404/index.html +1 -0
  195. ui/static/static/static/404.html +1 -0
  196. ui/static/static/static/__next.__PAGE__.txt +10 -0
  197. ui/static/static/static/__next._full.txt +30 -0
  198. ui/static/static/static/__next._head.txt +7 -0
  199. ui/static/static/static/__next._index.txt +9 -0
  200. ui/static/static/static/__next._tree.txt +2 -0
  201. ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
  202. ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
  203. ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
  204. ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
  205. ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
  206. ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
  207. ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
  208. ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
  209. ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
  210. ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
  211. ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
  212. ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
  213. ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
  214. ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
  215. ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
  216. ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
  217. ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
  218. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
  219. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
  220. ui/static/static/static/_not-found/__next._full.txt +17 -0
  221. ui/static/static/static/_not-found/__next._head.txt +7 -0
  222. ui/static/static/static/_not-found/__next._index.txt +9 -0
  223. ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
  224. ui/static/static/static/_not-found/__next._not-found.txt +4 -0
  225. ui/static/static/static/_not-found/__next._tree.txt +2 -0
  226. ui/static/static/static/_not-found/index.html +1 -0
  227. ui/static/static/static/_not-found/index.txt +17 -0
  228. ui/static/static/static/contracts/__next._full.txt +21 -0
  229. ui/static/static/static/contracts/__next._head.txt +7 -0
  230. ui/static/static/static/contracts/__next._index.txt +9 -0
  231. ui/static/static/static/contracts/__next._tree.txt +2 -0
  232. ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
  233. ui/static/static/static/contracts/__next.contracts.txt +4 -0
  234. ui/static/static/static/contracts/index.html +1 -0
  235. ui/static/static/static/contracts/index.txt +21 -0
  236. ui/static/static/static/documentation/__next._full.txt +21 -0
  237. ui/static/static/static/documentation/__next._head.txt +7 -0
  238. ui/static/static/static/documentation/__next._index.txt +9 -0
  239. ui/static/static/static/documentation/__next._tree.txt +2 -0
  240. ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
  241. ui/static/static/static/documentation/__next.documentation.txt +4 -0
  242. ui/static/static/static/documentation/index.html +93 -0
  243. ui/static/static/static/documentation/index.txt +21 -0
  244. ui/static/static/static/index.html +1 -0
  245. ui/static/static/static/index.txt +30 -0
  246. ui/static/static/static/metadata/__next._full.txt +21 -0
  247. ui/static/static/static/metadata/__next._head.txt +7 -0
  248. ui/static/static/static/metadata/__next._index.txt +9 -0
  249. ui/static/static/static/metadata/__next._tree.txt +2 -0
  250. ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
  251. ui/static/static/static/metadata/__next.metadata.txt +4 -0
  252. ui/static/static/static/metadata/index.html +1 -0
  253. ui/static/static/static/metadata/index.txt +21 -0
  254. ui/static/static/static/quality/__next._full.txt +21 -0
  255. ui/static/static/static/quality/__next._head.txt +7 -0
  256. ui/static/static/static/quality/__next._index.txt +9 -0
  257. ui/static/static/static/quality/__next._tree.txt +2 -0
  258. ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
  259. ui/static/static/static/quality/__next.quality.txt +4 -0
  260. ui/static/static/static/quality/index.html +2 -0
  261. ui/static/static/static/quality/index.txt +21 -0
  262. ui/static/static/static/rules/__next._full.txt +21 -0
  263. ui/static/static/static/rules/__next._head.txt +7 -0
  264. ui/static/static/static/rules/__next._index.txt +9 -0
  265. ui/static/static/static/rules/__next._tree.txt +2 -0
  266. ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
  267. ui/static/static/static/rules/__next.rules.txt +4 -0
  268. ui/static/static/static/rules/index.html +1 -0
  269. ui/static/static/static/rules/index.txt +21 -0
  270. ui/static/static/static/schemas/__next._full.txt +21 -0
  271. ui/static/static/static/schemas/__next._head.txt +7 -0
  272. ui/static/static/static/schemas/__next._index.txt +9 -0
  273. ui/static/static/static/schemas/__next._tree.txt +2 -0
  274. ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
  275. ui/static/static/static/schemas/__next.schemas.txt +4 -0
  276. ui/static/static/static/schemas/index.html +1 -0
  277. ui/static/static/static/schemas/index.txt +21 -0
  278. ui/static/static/static/settings/__next._full.txt +21 -0
  279. ui/static/static/static/settings/__next._head.txt +7 -0
  280. ui/static/static/static/settings/__next._index.txt +9 -0
  281. ui/static/static/static/settings/__next._tree.txt +2 -0
  282. ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
  283. ui/static/static/static/settings/__next.settings.txt +4 -0
  284. ui/static/static/static/settings/index.html +1 -0
  285. ui/static/static/static/settings/index.txt +21 -0
  286. ui/static/static/static/validation/__next._full.txt +21 -0
  287. ui/static/static/static/validation/__next._head.txt +7 -0
  288. ui/static/static/static/validation/__next._index.txt +9 -0
  289. ui/static/static/static/validation/__next._tree.txt +2 -0
  290. ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
  291. ui/static/static/static/validation/__next.validation.txt +4 -0
  292. ui/static/static/static/validation/index.html +1 -0
  293. ui/static/static/static/validation/index.txt +21 -0
  294. ui/static/static/validation/__next._full.txt +2 -2
  295. ui/static/static/validation/__next._head.txt +1 -1
  296. ui/static/static/validation/__next._index.txt +2 -2
  297. ui/static/static/validation/__next._tree.txt +2 -2
  298. ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
  299. ui/static/static/validation/__next.validation.txt +1 -1
  300. ui/static/static/validation/index.html +1 -1
  301. ui/static/static/validation/index.txt +2 -2
  302. ui/static/validation/__next._full.txt +2 -2
  303. ui/static/validation/__next._head.txt +1 -1
  304. ui/static/validation/__next._index.txt +1 -1
  305. ui/static/validation/__next._tree.txt +1 -1
  306. ui/static/validation/__next.validation.__PAGE__.txt +2 -2
  307. ui/static/validation/__next.validation.txt +1 -1
  308. ui/static/validation/index.html +1 -1
  309. ui/static/validation/index.txt +2 -2
  310. pycharter/data/templates/template_transform_advanced.yaml +0 -50
  311. pycharter/data/templates/template_transform_simple.yaml +0 -59
  312. pycharter-0.0.22.dist-info/RECORD +0 -358
  313. /pycharter/data/templates/{template_coercion_rules.yaml → contract/template_coercion_rules.yaml} +0 -0
  314. /pycharter/data/templates/{template_contract.yaml → contract/template_contract.yaml} +0 -0
  315. /pycharter/data/templates/{template_metadata.yaml → contract/template_metadata.yaml} +0 -0
  316. /pycharter/data/templates/{template_schema.yaml → contract/template_schema.yaml} +0 -0
  317. /pycharter/data/templates/{template_validation_rules.yaml → contract/template_validation_rules.yaml} +0 -0
  318. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/entry_points.txt +0 -0
  319. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/licenses/LICENSE +0 -0
  320. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/top_level.txt +0 -0
  321. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
  322. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
  323. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
  324. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
  325. /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
  326. /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
  327. /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
  328. /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
  329. /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
  330. /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
  331. /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
  332. /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
@@ -1,13 +1,14 @@
1
1
  """
2
- HTTP extraction utilities for ETL orchestrator.
3
-
4
- This module handles:
5
- - HTTP request configuration
6
- - Retry logic
7
- - Response parsing
8
- - Data extraction from various response formats
9
- - Pagination support (page, offset, cursor, next_url, link_header)
10
- - Streaming extraction for high-volume data
2
+ Extraction utilities for ETL orchestrator.
3
+
4
+ This module provides the main entry point for data extraction from various sources:
5
+ - HTTP/API extraction
6
+ - File-based extraction (CSV, JSON, Parquet, Excel, TSV, XML)
7
+ - Database extraction (PostgreSQL, MySQL, SQLite, MSSQL, Oracle)
8
+ - Cloud storage extraction (S3, GCS, Azure Blob)
9
+
10
+ The module maintains backward compatibility with the original HTTP-only interface
11
+ while supporting the new modular extractor architecture.
11
12
  """
12
13
 
13
14
  import asyncio
@@ -648,269 +649,53 @@ async def extract_with_pagination_streaming(
648
649
  """
649
650
  Extract data with pagination support, yielding batches for memory-efficient processing.
650
651
 
652
+ This is the main entry point for data extraction. It supports multiple source types:
653
+ - HTTP/API (default for backward compatibility)
654
+ - File-based (CSV, JSON, Parquet, Excel, TSV, XML)
655
+ - Database (PostgreSQL, MySQL, SQLite, MSSQL, Oracle)
656
+ - Cloud storage (S3, GCS, Azure Blob)
657
+
658
+ The source type is auto-detected from extract_config or can be explicitly set
659
+ via 'source_type' field.
660
+
651
661
  Yields batches as they are extracted, preventing memory exhaustion for large datasets.
652
662
 
653
663
  Args:
654
664
  extract_config: Extract configuration dictionary
655
- params: Request parameters
656
- headers: Request headers
665
+ params: Request/query parameters (source-specific)
666
+ headers: Request headers (source-specific, mainly for HTTP)
657
667
  contract_dir: Contract directory (for variable resolution)
658
668
  batch_size: Number of records to yield per batch
659
669
  max_records: Maximum total records to extract (None = all)
670
+ config_context: Optional context dictionary for value injection
660
671
 
661
672
  Yields:
662
673
  Batches of extracted records (lists of dictionaries)
663
- """
664
- pagination_config = extract_config.get('pagination', {})
665
-
666
- # If pagination is not enabled, extract all and yield in batches
667
- if not pagination_config.get('enabled', False):
668
- logger.info("Pagination disabled, extracting all data in single request")
669
- all_data = await extract_with_retry(extract_config, params, headers, contract_dir, config_context=config_context)
670
- if max_records:
671
- logger.info(f"Limiting to {max_records} records (extracted {len(all_data)})")
672
- all_data = all_data[:max_records]
673
-
674
- logger.info(f"Yielding {len(all_data)} records in batches of {batch_size}")
675
- for i in range(0, len(all_data), batch_size):
676
- batch = all_data[i:i + batch_size]
677
- logger.debug(f"Yielding batch {i // batch_size + 1} with {len(batch)} records")
678
- yield batch
679
- return
680
-
681
- # Pagination enabled - stream pages and yield in batches
682
- strategy = pagination_config.get('strategy', 'page')
683
- stop_conditions = pagination_config.get('stop_conditions', [])
684
- page_delay = float(pagination_config.get('page_delay', 0.1))
685
- max_pages = 1000
686
- max_records_from_config = None
687
-
688
- # Get max_pages and max_records from stop conditions
689
- for condition in stop_conditions:
690
- if condition.get('type') == 'max_pages':
691
- max_pages = condition.get('value', 1000)
692
- elif condition.get('type') == 'max_records':
693
- max_records_from_config = condition.get('value')
694
-
695
- # Use config max_records if not provided as parameter
696
- if max_records is None:
697
- max_records = max_records_from_config
698
-
699
- current_batch = []
700
- total_extracted = 0
701
- page_count = 0
702
- current_url = None
703
- current_cursor = None
704
-
705
- # Initialize pagination state
706
- if strategy == 'page':
707
- page_config = pagination_config.get('page', {})
708
- current_page = page_config.get('start', 0)
709
- page_increment = page_config.get('increment', 1)
710
- page_param_name = page_config.get('param_name', 'page')
711
- elif strategy == 'offset':
712
- offset_config = pagination_config.get('offset', {})
713
- current_offset = offset_config.get('start', 0)
714
- offset_param_name = offset_config.get('param_name', 'offset')
715
- increment_by = offset_config.get('increment_by', 'limit')
716
- elif strategy == 'cursor':
717
- cursor_config = pagination_config.get('cursor', {})
718
- cursor_param_name = cursor_config.get('param_name', 'cursor')
719
- cursor_response_path = cursor_config.get('response_path', 'next_cursor')
720
- elif strategy == 'next_url':
721
- next_url_config = pagination_config.get('next_url', {})
722
- next_url_response_path = next_url_config.get('response_path', 'next_url')
723
- elif strategy == 'link_header':
724
- pass
725
- else:
726
- raise ValueError(f"Unsupported pagination strategy: {strategy}")
727
-
728
- extract_config_copy = extract_config.copy()
729
- original_endpoint = extract_config_copy.get('api_endpoint')
730
- original_base_url = extract_config_copy.get('base_url', '')
731
674
 
732
- logger.info(
733
- f"Starting paginated extraction (strategy: {strategy}, "
734
- f"max_pages: {max_pages}, batch_size: {batch_size}, "
735
- f"page_delay: {page_delay}s)"
736
- )
737
-
738
- while page_count < max_pages:
739
- # Check max_records limit
740
- if max_records and total_extracted >= max_records:
741
- logger.info(
742
- f"Reached max_records limit ({max_records}), stopping pagination "
743
- f"(extracted {total_extracted} records from {page_count} pages)"
744
- )
745
- if current_batch:
746
- yield current_batch
747
- return
748
-
749
- # Update params/URL based on strategy
750
- if strategy == 'page':
751
- params[page_param_name] = current_page
752
- logger.debug(f"Fetching page {current_page} (page_count: {page_count + 1}/{max_pages})")
753
- elif strategy == 'offset':
754
- params[offset_param_name] = current_offset
755
- elif strategy == 'cursor' and current_cursor:
756
- params[cursor_param_name] = current_cursor
757
- elif strategy == 'next_url' and current_url:
758
- extract_config_copy['api_endpoint'] = current_url
759
- extract_config_copy['base_url'] = ''
760
-
761
- # Make request
762
- need_full_response = strategy in ['cursor', 'next_url', 'link_header']
763
- try:
764
- logger.debug(f"Extracting page {page_count + 1} (total extracted so far: {total_extracted})")
765
- page_data, full_response_data, response_obj = await _extract_single_page(
766
- extract_config_copy, params, headers, contract_dir, return_full_response=need_full_response, config_context=config_context
767
- )
768
- logger.info(f"Page {page_count + 1} extracted: {len(page_data)} records")
769
- except Exception as e:
770
- logger.error(
771
- f"Error extracting page {page_count + 1}",
772
- extra={
773
- 'page': page_count + 1,
774
- 'extracted': total_extracted,
775
- },
776
- exc_info=True
777
- )
778
- # Yield what we have so far before raising
779
- if current_batch:
780
- yield current_batch
781
- raise
782
-
783
- # Restore original endpoint if modified
784
- if strategy == 'next_url' and current_url:
785
- extract_config_copy['api_endpoint'] = original_endpoint
786
- extract_config_copy['base_url'] = original_base_url
787
-
788
- # Check for empty page first (before adding to batch)
789
- if not page_data:
790
- logger.info(f"Empty page {page_count + 1} received, stopping pagination")
791
- if current_batch:
792
- yield current_batch
793
- break
794
-
795
- # Check stop conditions BEFORE adding records to batch
796
- # This prevents unnecessary API calls when we know we should stop
797
- page_count += 1
798
- limit_value = params.get('limit', 100)
799
- record_count = len(page_data)
800
- logger.info(
801
- f"Evaluating stop conditions for page {page_count}: "
802
- f"{record_count} records returned, limit={limit_value}"
803
- )
804
- should_stop = _check_stop_conditions(page_data, stop_conditions, params, full_response_data)
805
- if should_stop:
806
- logger.info(
807
- f"✅ Stop condition met at page {page_count} "
808
- f"(page returned {record_count} records, limit: {limit_value})"
809
- )
810
- # Add the final page's records to current batch before yielding
811
- for record in page_data:
812
- current_batch.append(record)
813
- total_extracted += 1
814
- # Yield batch if it reaches batch_size during this final page
815
- if len(current_batch) >= batch_size:
816
- yield current_batch
817
- current_batch = []
818
- # Yield any remaining records
819
- if current_batch:
820
- yield current_batch
821
- break
822
-
823
- # Add page data to current batch (only if we're not stopping)
824
- for record in page_data:
825
- current_batch.append(record)
826
- total_extracted += 1
827
-
828
- # Yield batch when full
829
- if len(current_batch) >= batch_size:
830
- yield current_batch
831
- current_batch = []
832
-
833
- # Check max_records limit
834
- if max_records and total_extracted >= max_records:
835
- if current_batch:
836
- yield current_batch
837
- return
675
+ Example:
676
+ >>> # HTTP extraction (backward compatible)
677
+ >>> extract_config = {
678
+ ... 'base_url': 'https://api.example.com',
679
+ ... 'api_endpoint': '/v1/data'
680
+ ... }
681
+ >>> async for batch in extract_with_pagination_streaming(extract_config, {}, {}):
682
+ ... print(f"Extracted {len(batch)} records")
838
683
 
839
- # Extract pagination token/URL for next iteration
840
- if strategy == 'cursor' and full_response_data:
841
- try:
842
- current = full_response_data
843
- for part in cursor_response_path.split('.'):
844
- if isinstance(current, dict):
845
- current = current.get(part)
846
- elif isinstance(current, list) and part.isdigit():
847
- current = current[int(part)]
848
- else:
849
- current = None
850
- break
851
-
852
- if current and isinstance(current, str):
853
- current_cursor = current
854
- elif current:
855
- current_cursor = str(current)
856
- else:
857
- if current_batch:
858
- yield current_batch
859
- break
860
- except (KeyError, IndexError, TypeError, ValueError):
861
- if current_batch:
862
- yield current_batch
863
- break
864
-
865
- elif strategy == 'next_url' and full_response_data:
866
- try:
867
- current = full_response_data
868
- for part in next_url_response_path.split('.'):
869
- if isinstance(current, dict):
870
- current = current.get(part)
871
- elif isinstance(current, list) and part.isdigit():
872
- current = current[int(part)]
873
- else:
874
- current = None
875
- break
876
-
877
- if current and isinstance(current, str):
878
- current_url = current
879
- else:
880
- current_url = None
881
-
882
- if not current_url:
883
- if current_batch:
884
- yield current_batch
885
- break
886
- except (KeyError, IndexError, TypeError, ValueError):
887
- if current_batch:
888
- yield current_batch
889
- break
890
-
891
- elif strategy == 'link_header' and response_obj:
892
- current_url = _extract_link_header_url(response_obj)
893
- if not current_url:
894
- if current_batch:
895
- yield current_batch
896
- break
897
- extract_config_copy['api_endpoint'] = current_url
898
- extract_config_copy['base_url'] = ''
899
-
900
- # Update pagination state
901
- if strategy == 'page':
902
- current_page += page_increment
903
- elif strategy == 'offset':
904
- limit = params.get('limit', 100)
905
- if increment_by == 'limit':
906
- current_offset += limit
907
- else:
908
- current_offset += int(increment_by)
909
-
910
- # Delay between pages
911
- if page_delay > 0:
912
- await asyncio.sleep(page_delay)
684
+ >>> # File extraction
685
+ >>> extract_config = {
686
+ ... 'source_type': 'file',
687
+ ... 'file_path': '/path/to/data.csv'
688
+ ... }
689
+ >>> async for batch in extract_with_pagination_streaming(extract_config, {}, {}):
690
+ ... print(f"Extracted {len(batch)} records")
691
+ """
692
+ # Use factory to get appropriate extractor
693
+ from pycharter.etl_generator.extractors.factory import ExtractorFactory
694
+
695
+ extractor = ExtractorFactory.get_extractor(extract_config)
913
696
 
914
- # Yield remaining records
915
- if current_batch:
916
- yield current_batch
697
+ # Delegate to extractor
698
+ async for batch in extractor.extract_streaming(
699
+ extract_config, params, headers, contract_dir, batch_size, max_records, config_context
700
+ ):
701
+ yield batch
@@ -0,0 +1,26 @@
1
+ """
2
+ Extractors module for ETL orchestrator.
3
+
4
+ This module provides a modular architecture for data extraction from various sources:
5
+ - HTTP/API extraction
6
+ - File-based extraction (CSV, JSON, Parquet, Excel, TSV, XML)
7
+ - Database extraction (PostgreSQL, MySQL, SQLite, MSSQL, Oracle)
8
+ - Cloud storage extraction (S3, GCS, Azure Blob)
9
+ """
10
+
11
+ from pycharter.etl_generator.extractors.base import BaseExtractor
12
+ from pycharter.etl_generator.extractors.cloud_storage import CloudStorageExtractor
13
+ from pycharter.etl_generator.extractors.database import DatabaseExtractor
14
+ from pycharter.etl_generator.extractors.file import FileExtractor
15
+ from pycharter.etl_generator.extractors.factory import ExtractorFactory, get_extractor
16
+ from pycharter.etl_generator.extractors.http import HTTPExtractor
17
+
18
+ __all__ = [
19
+ "BaseExtractor",
20
+ "ExtractorFactory",
21
+ "get_extractor",
22
+ "HTTPExtractor",
23
+ "FileExtractor",
24
+ "DatabaseExtractor",
25
+ "CloudStorageExtractor",
26
+ ]
@@ -0,0 +1,70 @@
1
+ """
2
+ Base extractor interface for ETL orchestrator.
3
+
4
+ All extractors must implement this interface to ensure consistent behavior
5
+ across different data sources.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import Any, AsyncIterator, Dict, List, Optional
10
+
11
+
12
class BaseExtractor(ABC):
    """
    Base class for all data extractors.

    All extractors must implement the extract_streaming method which yields
    batches of records as dictionaries. Extractors are schema-agnostic and
    focus purely on data retrieval from their respective sources.
    """

    @abstractmethod
    async def extract_streaming(
        self,
        extract_config: Dict[str, Any],
        params: Dict[str, Any],
        headers: Dict[str, Any],
        contract_dir: Optional[Any] = None,
        batch_size: int = 1000,
        max_records: Optional[int] = None,
        config_context: Optional[Dict[str, Any]] = None,
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """
        Extract data in batches using async generator.

        This is the main interface that all extractors must implement.
        It yields batches of records as lists of dictionaries, allowing
        for memory-efficient processing of large datasets.

        Args:
            extract_config: Extract configuration dictionary (source-specific)
            params: Request/query parameters (may be source-specific)
            headers: Request headers (may be source-specific)
            contract_dir: Contract directory path (for variable resolution)
            batch_size: Number of records to yield per batch
            max_records: Maximum total records to extract (None = all)
            config_context: Optional context dictionary for value injection

        Yields:
            Batches of extracted records (lists of dictionaries)

        Raises:
            NotImplementedError: If the base implementation itself is iterated
                (for example through ``super()``) instead of an override
            RuntimeError: If extraction fails
            ValueError: If configuration is invalid
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement extract_streaming()"
        )
        # Unreachable on purpose: the presence of ``yield`` makes this an
        # async generator function, so the abstract signature has the same
        # kind (and the same static type) as the ``async def ... yield``
        # overrides in subclasses. Without it, ``async def`` + ``pass`` is a
        # plain coroutine that cannot be used with ``async for``.
        if False:
            yield []

    def validate_config(self, extract_config: Dict[str, Any]) -> None:
        """
        Validate extractor-specific configuration.

        Override this method in subclasses to validate source-specific
        configuration requirements. The base implementation accepts any
        configuration and does nothing.

        Args:
            extract_config: Extract configuration dictionary

        Raises:
            ValueError: If configuration is invalid
        """
        return None