pycharter 0.0.22__py3-none-any.whl → 0.0.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332) hide show
  1. api/routes/v1/templates.py +43 -24
  2. pycharter/data/templates/etl/README.md +91 -0
  3. pycharter/data/templates/etl/extract_cloud_azure.yaml +23 -0
  4. pycharter/data/templates/etl/extract_cloud_gcs.yaml +22 -0
  5. pycharter/data/templates/etl/extract_cloud_s3.yaml +24 -0
  6. pycharter/data/templates/etl/extract_database.yaml +28 -0
  7. pycharter/data/templates/etl/extract_database_ssh.yaml +27 -0
  8. pycharter/data/templates/etl/extract_file_csv.yaml +17 -0
  9. pycharter/data/templates/etl/extract_file_glob.yaml +17 -0
  10. pycharter/data/templates/etl/extract_file_json.yaml +14 -0
  11. pycharter/data/templates/etl/extract_file_parquet.yaml +13 -0
  12. pycharter/data/templates/etl/extract_http_paginated.yaml +75 -0
  13. pycharter/data/templates/etl/extract_http_path_params.yaml +45 -0
  14. pycharter/data/templates/etl/extract_http_simple.yaml +52 -0
  15. pycharter/data/templates/etl/load_insert.yaml +17 -0
  16. pycharter/data/templates/etl/load_postgresql.yaml +17 -0
  17. pycharter/data/templates/etl/load_sqlite.yaml +16 -0
  18. pycharter/data/templates/etl/load_truncate_and_load.yaml +18 -0
  19. pycharter/data/templates/etl/load_upsert.yaml +28 -0
  20. pycharter/data/templates/etl/load_with_dlq.yaml +24 -0
  21. pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +28 -0
  22. pycharter/data/templates/etl/pipeline_http_to_db.yaml +38 -0
  23. pycharter/data/templates/etl/transform_combined.yaml +38 -0
  24. pycharter/data/templates/etl/transform_custom_function.yaml +18 -0
  25. pycharter/data/templates/etl/transform_jsonata.yaml +20 -0
  26. pycharter/data/templates/etl/transform_simple.yaml +41 -0
  27. pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
  28. pycharter/etl_generator/extraction.py +47 -262
  29. pycharter/etl_generator/extractors/__init__.py +26 -0
  30. pycharter/etl_generator/extractors/base.py +70 -0
  31. pycharter/etl_generator/extractors/cloud_storage.py +454 -0
  32. pycharter/etl_generator/extractors/database.py +151 -0
  33. pycharter/etl_generator/extractors/factory.py +141 -0
  34. pycharter/etl_generator/extractors/file.py +418 -0
  35. pycharter/etl_generator/extractors/http.py +816 -0
  36. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/METADATA +6 -1
  37. pycharter-0.0.23.dist-info/RECORD +498 -0
  38. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/WHEEL +1 -1
  39. ui/static/404/index.html +1 -1
  40. ui/static/404.html +1 -1
  41. ui/static/__next.__PAGE__.txt +1 -1
  42. ui/static/__next._full.txt +1 -1
  43. ui/static/__next._head.txt +1 -1
  44. ui/static/__next._index.txt +1 -1
  45. ui/static/__next._tree.txt +1 -1
  46. ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
  47. ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
  48. ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
  49. ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
  50. ui/static/_not-found/__next._full.txt +1 -1
  51. ui/static/_not-found/__next._head.txt +1 -1
  52. ui/static/_not-found/__next._index.txt +1 -1
  53. ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  54. ui/static/_not-found/__next._not-found.txt +1 -1
  55. ui/static/_not-found/__next._tree.txt +1 -1
  56. ui/static/_not-found/index.html +1 -1
  57. ui/static/_not-found/index.txt +1 -1
  58. ui/static/contracts/__next._full.txt +2 -2
  59. ui/static/contracts/__next._head.txt +1 -1
  60. ui/static/contracts/__next._index.txt +1 -1
  61. ui/static/contracts/__next._tree.txt +1 -1
  62. ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  63. ui/static/contracts/__next.contracts.txt +1 -1
  64. ui/static/contracts/index.html +1 -1
  65. ui/static/contracts/index.txt +2 -2
  66. ui/static/documentation/__next._full.txt +1 -1
  67. ui/static/documentation/__next._head.txt +1 -1
  68. ui/static/documentation/__next._index.txt +1 -1
  69. ui/static/documentation/__next._tree.txt +1 -1
  70. ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
  71. ui/static/documentation/__next.documentation.txt +1 -1
  72. ui/static/documentation/index.html +2 -2
  73. ui/static/documentation/index.txt +1 -1
  74. ui/static/index.html +1 -1
  75. ui/static/index.txt +1 -1
  76. ui/static/metadata/__next._full.txt +1 -1
  77. ui/static/metadata/__next._head.txt +1 -1
  78. ui/static/metadata/__next._index.txt +1 -1
  79. ui/static/metadata/__next._tree.txt +1 -1
  80. ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  81. ui/static/metadata/__next.metadata.txt +1 -1
  82. ui/static/metadata/index.html +1 -1
  83. ui/static/metadata/index.txt +1 -1
  84. ui/static/quality/__next._full.txt +2 -2
  85. ui/static/quality/__next._head.txt +1 -1
  86. ui/static/quality/__next._index.txt +1 -1
  87. ui/static/quality/__next._tree.txt +1 -1
  88. ui/static/quality/__next.quality.__PAGE__.txt +2 -2
  89. ui/static/quality/__next.quality.txt +1 -1
  90. ui/static/quality/index.html +2 -2
  91. ui/static/quality/index.txt +2 -2
  92. ui/static/rules/__next._full.txt +1 -1
  93. ui/static/rules/__next._head.txt +1 -1
  94. ui/static/rules/__next._index.txt +1 -1
  95. ui/static/rules/__next._tree.txt +1 -1
  96. ui/static/rules/__next.rules.__PAGE__.txt +1 -1
  97. ui/static/rules/__next.rules.txt +1 -1
  98. ui/static/rules/index.html +1 -1
  99. ui/static/rules/index.txt +1 -1
  100. ui/static/schemas/__next._full.txt +1 -1
  101. ui/static/schemas/__next._head.txt +1 -1
  102. ui/static/schemas/__next._index.txt +1 -1
  103. ui/static/schemas/__next._tree.txt +1 -1
  104. ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  105. ui/static/schemas/__next.schemas.txt +1 -1
  106. ui/static/schemas/index.html +1 -1
  107. ui/static/schemas/index.txt +1 -1
  108. ui/static/settings/__next._full.txt +1 -1
  109. ui/static/settings/__next._head.txt +1 -1
  110. ui/static/settings/__next._index.txt +1 -1
  111. ui/static/settings/__next._tree.txt +1 -1
  112. ui/static/settings/__next.settings.__PAGE__.txt +1 -1
  113. ui/static/settings/__next.settings.txt +1 -1
  114. ui/static/settings/index.html +1 -1
  115. ui/static/settings/index.txt +1 -1
  116. ui/static/static/404/index.html +1 -1
  117. ui/static/static/404.html +1 -1
  118. ui/static/static/__next.__PAGE__.txt +1 -1
  119. ui/static/static/__next._full.txt +2 -2
  120. ui/static/static/__next._head.txt +1 -1
  121. ui/static/static/__next._index.txt +2 -2
  122. ui/static/static/__next._tree.txt +2 -2
  123. ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
  124. ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
  125. ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
  126. ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
  127. ui/static/static/_not-found/__next._full.txt +2 -2
  128. ui/static/static/_not-found/__next._head.txt +1 -1
  129. ui/static/static/_not-found/__next._index.txt +2 -2
  130. ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  131. ui/static/static/_not-found/__next._not-found.txt +1 -1
  132. ui/static/static/_not-found/__next._tree.txt +2 -2
  133. ui/static/static/_not-found/index.html +1 -1
  134. ui/static/static/_not-found/index.txt +2 -2
  135. ui/static/static/contracts/__next._full.txt +3 -3
  136. ui/static/static/contracts/__next._head.txt +1 -1
  137. ui/static/static/contracts/__next._index.txt +2 -2
  138. ui/static/static/contracts/__next._tree.txt +2 -2
  139. ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  140. ui/static/static/contracts/__next.contracts.txt +1 -1
  141. ui/static/static/contracts/index.html +1 -1
  142. ui/static/static/contracts/index.txt +3 -3
  143. ui/static/static/documentation/__next._full.txt +3 -3
  144. ui/static/static/documentation/__next._head.txt +1 -1
  145. ui/static/static/documentation/__next._index.txt +2 -2
  146. ui/static/static/documentation/__next._tree.txt +2 -2
  147. ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
  148. ui/static/static/documentation/__next.documentation.txt +1 -1
  149. ui/static/static/documentation/index.html +2 -2
  150. ui/static/static/documentation/index.txt +3 -3
  151. ui/static/static/index.html +1 -1
  152. ui/static/static/index.txt +2 -2
  153. ui/static/static/metadata/__next._full.txt +2 -2
  154. ui/static/static/metadata/__next._head.txt +1 -1
  155. ui/static/static/metadata/__next._index.txt +2 -2
  156. ui/static/static/metadata/__next._tree.txt +2 -2
  157. ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  158. ui/static/static/metadata/__next.metadata.txt +1 -1
  159. ui/static/static/metadata/index.html +1 -1
  160. ui/static/static/metadata/index.txt +2 -2
  161. ui/static/static/quality/__next._full.txt +2 -2
  162. ui/static/static/quality/__next._head.txt +1 -1
  163. ui/static/static/quality/__next._index.txt +2 -2
  164. ui/static/static/quality/__next._tree.txt +2 -2
  165. ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
  166. ui/static/static/quality/__next.quality.txt +1 -1
  167. ui/static/static/quality/index.html +2 -2
  168. ui/static/static/quality/index.txt +2 -2
  169. ui/static/static/rules/__next._full.txt +2 -2
  170. ui/static/static/rules/__next._head.txt +1 -1
  171. ui/static/static/rules/__next._index.txt +2 -2
  172. ui/static/static/rules/__next._tree.txt +2 -2
  173. ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
  174. ui/static/static/rules/__next.rules.txt +1 -1
  175. ui/static/static/rules/index.html +1 -1
  176. ui/static/static/rules/index.txt +2 -2
  177. ui/static/static/schemas/__next._full.txt +2 -2
  178. ui/static/static/schemas/__next._head.txt +1 -1
  179. ui/static/static/schemas/__next._index.txt +2 -2
  180. ui/static/static/schemas/__next._tree.txt +2 -2
  181. ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  182. ui/static/static/schemas/__next.schemas.txt +1 -1
  183. ui/static/static/schemas/index.html +1 -1
  184. ui/static/static/schemas/index.txt +2 -2
  185. ui/static/static/settings/__next._full.txt +2 -2
  186. ui/static/static/settings/__next._head.txt +1 -1
  187. ui/static/static/settings/__next._index.txt +2 -2
  188. ui/static/static/settings/__next._tree.txt +2 -2
  189. ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
  190. ui/static/static/settings/__next.settings.txt +1 -1
  191. ui/static/static/settings/index.html +1 -1
  192. ui/static/static/settings/index.txt +2 -2
  193. ui/static/static/static/.gitkeep +0 -0
  194. ui/static/static/static/404/index.html +1 -0
  195. ui/static/static/static/404.html +1 -0
  196. ui/static/static/static/__next.__PAGE__.txt +10 -0
  197. ui/static/static/static/__next._full.txt +30 -0
  198. ui/static/static/static/__next._head.txt +7 -0
  199. ui/static/static/static/__next._index.txt +9 -0
  200. ui/static/static/static/__next._tree.txt +2 -0
  201. ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
  202. ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
  203. ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
  204. ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
  205. ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
  206. ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
  207. ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
  208. ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
  209. ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
  210. ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
  211. ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
  212. ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
  213. ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
  214. ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
  215. ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
  216. ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
  217. ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
  218. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
  219. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
  220. ui/static/static/static/_not-found/__next._full.txt +17 -0
  221. ui/static/static/static/_not-found/__next._head.txt +7 -0
  222. ui/static/static/static/_not-found/__next._index.txt +9 -0
  223. ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
  224. ui/static/static/static/_not-found/__next._not-found.txt +4 -0
  225. ui/static/static/static/_not-found/__next._tree.txt +2 -0
  226. ui/static/static/static/_not-found/index.html +1 -0
  227. ui/static/static/static/_not-found/index.txt +17 -0
  228. ui/static/static/static/contracts/__next._full.txt +21 -0
  229. ui/static/static/static/contracts/__next._head.txt +7 -0
  230. ui/static/static/static/contracts/__next._index.txt +9 -0
  231. ui/static/static/static/contracts/__next._tree.txt +2 -0
  232. ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
  233. ui/static/static/static/contracts/__next.contracts.txt +4 -0
  234. ui/static/static/static/contracts/index.html +1 -0
  235. ui/static/static/static/contracts/index.txt +21 -0
  236. ui/static/static/static/documentation/__next._full.txt +21 -0
  237. ui/static/static/static/documentation/__next._head.txt +7 -0
  238. ui/static/static/static/documentation/__next._index.txt +9 -0
  239. ui/static/static/static/documentation/__next._tree.txt +2 -0
  240. ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
  241. ui/static/static/static/documentation/__next.documentation.txt +4 -0
  242. ui/static/static/static/documentation/index.html +93 -0
  243. ui/static/static/static/documentation/index.txt +21 -0
  244. ui/static/static/static/index.html +1 -0
  245. ui/static/static/static/index.txt +30 -0
  246. ui/static/static/static/metadata/__next._full.txt +21 -0
  247. ui/static/static/static/metadata/__next._head.txt +7 -0
  248. ui/static/static/static/metadata/__next._index.txt +9 -0
  249. ui/static/static/static/metadata/__next._tree.txt +2 -0
  250. ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
  251. ui/static/static/static/metadata/__next.metadata.txt +4 -0
  252. ui/static/static/static/metadata/index.html +1 -0
  253. ui/static/static/static/metadata/index.txt +21 -0
  254. ui/static/static/static/quality/__next._full.txt +21 -0
  255. ui/static/static/static/quality/__next._head.txt +7 -0
  256. ui/static/static/static/quality/__next._index.txt +9 -0
  257. ui/static/static/static/quality/__next._tree.txt +2 -0
  258. ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
  259. ui/static/static/static/quality/__next.quality.txt +4 -0
  260. ui/static/static/static/quality/index.html +2 -0
  261. ui/static/static/static/quality/index.txt +21 -0
  262. ui/static/static/static/rules/__next._full.txt +21 -0
  263. ui/static/static/static/rules/__next._head.txt +7 -0
  264. ui/static/static/static/rules/__next._index.txt +9 -0
  265. ui/static/static/static/rules/__next._tree.txt +2 -0
  266. ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
  267. ui/static/static/static/rules/__next.rules.txt +4 -0
  268. ui/static/static/static/rules/index.html +1 -0
  269. ui/static/static/static/rules/index.txt +21 -0
  270. ui/static/static/static/schemas/__next._full.txt +21 -0
  271. ui/static/static/static/schemas/__next._head.txt +7 -0
  272. ui/static/static/static/schemas/__next._index.txt +9 -0
  273. ui/static/static/static/schemas/__next._tree.txt +2 -0
  274. ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
  275. ui/static/static/static/schemas/__next.schemas.txt +4 -0
  276. ui/static/static/static/schemas/index.html +1 -0
  277. ui/static/static/static/schemas/index.txt +21 -0
  278. ui/static/static/static/settings/__next._full.txt +21 -0
  279. ui/static/static/static/settings/__next._head.txt +7 -0
  280. ui/static/static/static/settings/__next._index.txt +9 -0
  281. ui/static/static/static/settings/__next._tree.txt +2 -0
  282. ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
  283. ui/static/static/static/settings/__next.settings.txt +4 -0
  284. ui/static/static/static/settings/index.html +1 -0
  285. ui/static/static/static/settings/index.txt +21 -0
  286. ui/static/static/static/validation/__next._full.txt +21 -0
  287. ui/static/static/static/validation/__next._head.txt +7 -0
  288. ui/static/static/static/validation/__next._index.txt +9 -0
  289. ui/static/static/static/validation/__next._tree.txt +2 -0
  290. ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
  291. ui/static/static/static/validation/__next.validation.txt +4 -0
  292. ui/static/static/static/validation/index.html +1 -0
  293. ui/static/static/static/validation/index.txt +21 -0
  294. ui/static/static/validation/__next._full.txt +2 -2
  295. ui/static/static/validation/__next._head.txt +1 -1
  296. ui/static/static/validation/__next._index.txt +2 -2
  297. ui/static/static/validation/__next._tree.txt +2 -2
  298. ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
  299. ui/static/static/validation/__next.validation.txt +1 -1
  300. ui/static/static/validation/index.html +1 -1
  301. ui/static/static/validation/index.txt +2 -2
  302. ui/static/validation/__next._full.txt +2 -2
  303. ui/static/validation/__next._head.txt +1 -1
  304. ui/static/validation/__next._index.txt +1 -1
  305. ui/static/validation/__next._tree.txt +1 -1
  306. ui/static/validation/__next.validation.__PAGE__.txt +2 -2
  307. ui/static/validation/__next.validation.txt +1 -1
  308. ui/static/validation/index.html +1 -1
  309. ui/static/validation/index.txt +2 -2
  310. pycharter/data/templates/template_transform_advanced.yaml +0 -50
  311. pycharter/data/templates/template_transform_simple.yaml +0 -59
  312. pycharter-0.0.22.dist-info/RECORD +0 -358
  313. /pycharter/data/templates/{template_coercion_rules.yaml → contract/template_coercion_rules.yaml} +0 -0
  314. /pycharter/data/templates/{template_contract.yaml → contract/template_contract.yaml} +0 -0
  315. /pycharter/data/templates/{template_metadata.yaml → contract/template_metadata.yaml} +0 -0
  316. /pycharter/data/templates/{template_schema.yaml → contract/template_schema.yaml} +0 -0
  317. /pycharter/data/templates/{template_validation_rules.yaml → contract/template_validation_rules.yaml} +0 -0
  318. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/entry_points.txt +0 -0
  319. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/licenses/LICENSE +0 -0
  320. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/top_level.txt +0 -0
  321. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
  322. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
  323. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
  324. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
  325. /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
  326. /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
  327. /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
  328. /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
  329. /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
  330. /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
  331. /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
  332. /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
@@ -0,0 +1,141 @@
1
+ """
2
+ Extractor factory for ETL orchestrator.
3
+
4
+ Provides a registry pattern to select and instantiate the appropriate extractor
5
+ based on the source type specified in extract.yaml configuration.
6
+ """
7
+
8
+ import logging
9
+ from typing import Any, Dict, Optional
10
+
11
+ from pycharter.etl_generator.extractors.base import BaseExtractor
12
+ from pycharter.etl_generator.extractors.cloud_storage import CloudStorageExtractor
13
+ from pycharter.etl_generator.extractors.database import DatabaseExtractor
14
+ from pycharter.etl_generator.extractors.file import FileExtractor
15
+ from pycharter.etl_generator.extractors.http import HTTPExtractor
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class ExtractorFactory:
    """
    Factory for creating extractor instances based on source type.

    Keeps a class-level registry that maps source-type strings to extractor
    classes. When ``extract_config`` does not name a ``source_type``, the type
    is inferred from which well-known keys the configuration contains.
    """

    # Registry of extractors by source type. This dict is shared by every user
    # of the factory; register_extractor() mutates it in place.
    _extractors: Dict[str, type[BaseExtractor]] = {
        'http': HTTPExtractor,
        'file': FileExtractor,
        'database': DatabaseExtractor,
        'cloud_storage': CloudStorageExtractor,
    }

    @classmethod
    def register_extractor(cls, source_type: str, extractor_class: type[BaseExtractor]) -> None:
        """
        Register a custom extractor class.

        An existing registration for the same ``source_type`` is replaced.

        Args:
            source_type: Source type identifier (e.g., 'kafka', 'mongodb')
            extractor_class: Extractor class that inherits from BaseExtractor

        Raises:
            ValueError: If extractor_class is not a class derived from BaseExtractor
        """
        # issubclass() raises TypeError when its first argument is not a class
        # (for example an instance was passed by mistake). Check that first so
        # callers always receive the same ValueError for any bad argument.
        is_extractor_class = isinstance(extractor_class, type) and issubclass(
            extractor_class, BaseExtractor
        )
        if not is_extractor_class:
            raise ValueError(f"Extractor class must inherit from BaseExtractor: {extractor_class}")
        cls._extractors[source_type] = extractor_class
        logger.info(f"Registered extractor for source_type: {source_type}")

    @classmethod
    def get_extractor(cls, extract_config: Dict[str, Any]) -> BaseExtractor:
        """
        Get appropriate extractor instance based on extract configuration.

        Auto-detects source type if not explicitly specified:
        - If 'source_type' is specified, use it
        - If 'base_url' or 'api_endpoint' exists, assume 'http'
        - If 'file_path' exists, assume 'file'
        - If 'database' config exists, assume 'database'
        - If 'storage' config exists, assume 'cloud_storage'
        - Otherwise fall back to 'http' (a warning is logged)

        Args:
            extract_config: Extract configuration dictionary

        Returns:
            Extractor instance whose validate_config() has been called with
            extract_config

        Raises:
            ValueError: If no extractor is registered for the source type.
                validate_config() may raise as well for an invalid config.
        """
        # Check for explicit source_type
        source_type = extract_config.get('source_type')

        # Auto-detect if not specified
        if not source_type:
            source_type = cls._detect_source_type(extract_config)
            logger.info(f"Auto-detected source_type: {source_type}")

        # Get extractor class
        extractor_class = cls._extractors.get(source_type)
        if not extractor_class:
            available = ', '.join(cls._extractors.keys())
            raise ValueError(
                f"Unknown source_type: {source_type}. "
                f"Available types: {available}. "
                f"Register a custom extractor with ExtractorFactory.register_extractor()"
            )

        # Create and validate extractor instance
        extractor = extractor_class()
        extractor.validate_config(extract_config)

        logger.debug(f"Created {extractor_class.__name__} for source_type: {source_type}")
        return extractor

    @classmethod
    def _detect_source_type(cls, extract_config: Dict[str, Any]) -> str:
        """
        Auto-detect source type from extract configuration.

        Indicators are checked in a fixed order (http, file, database,
        cloud_storage); the first one with a truthy value wins.

        Args:
            extract_config: Extract configuration dictionary

        Returns:
            Detected source type string
        """
        # Check for HTTP indicators
        if extract_config.get('base_url') or extract_config.get('api_endpoint'):
            return 'http'

        # Check for file indicators
        if extract_config.get('file_path'):
            return 'file'

        # Check for database indicators
        if extract_config.get('database'):
            return 'database'

        # Check for cloud storage indicators
        if extract_config.get('storage'):
            return 'cloud_storage'

        # Default to HTTP for backward compatibility
        logger.warning(
            "Could not auto-detect source_type from extract_config. "
            "Defaulting to 'http' for backward compatibility. "
            "Consider explicitly setting 'source_type' in extract.yaml"
        )
        return 'http'
129
+
130
+
131
def get_extractor(extract_config: Dict[str, Any]) -> BaseExtractor:
    """
    Module-level shortcut for ``ExtractorFactory.get_extractor``.

    Args:
        extract_config: Extract configuration dictionary

    Returns:
        The extractor instance produced by the factory
    """
    extractor = ExtractorFactory.get_extractor(extract_config)
    return extractor
@@ -0,0 +1,418 @@
1
+ """
2
+ File-based extractor for ETL orchestrator.
3
+
4
+ Supports reading from local files in various formats:
5
+ - CSV, TSV
6
+ - JSON (single file or newline-delimited JSON)
7
+ - Parquet
8
+ - Excel (xlsx, xls)
9
+ - XML
10
+ """
11
+
12
+ import gzip
13
+ import json
14
+ import logging
15
+ import zipfile
16
+ from pathlib import Path
17
+ from typing import Any, AsyncIterator, Dict, List, Optional
18
+
19
+ import pandas as pd
20
+
21
+ from pycharter.etl_generator.extractors.base import BaseExtractor
22
+ from pycharter.utils.value_injector import resolve_values
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
# Supported file formats: maps a file suffix to the name of the reader format.
# Declared per format so that suffix aliases sit next to each other.
SUPPORTED_FORMATS = {
    suffix: format_name
    for format_name, suffixes in (
        ('csv', ('.csv',)),
        ('tsv', ('.tsv',)),
        ('json', ('.json',)),
        ('jsonl', ('.jsonl', '.ndjson')),  # Newline-delimited JSON
        ('parquet', ('.parquet',)),
        ('excel', ('.xlsx', '.xls')),
        ('xml', ('.xml',)),
    )
    for suffix in suffixes
}
38
+
39
+
40
+ class FileExtractor(BaseExtractor):
41
+ """Extractor for file-based data sources."""
42
+
43
def validate_config(self, extract_config: Dict[str, Any]) -> None:
    """
    Check that a configuration is usable by the file extractor.

    Args:
        extract_config: Extract configuration dictionary

    Raises:
        ValueError: If 'source_type' is present but is not 'file', or if
            'file_path' is missing or empty
    """
    declared_type = extract_config.get('source_type')
    # Only an explicitly present key is checked; an absent key is accepted.
    if 'source_type' in extract_config and declared_type != 'file':
        raise ValueError(f"FileExtractor requires source_type='file', got '{declared_type}'")

    if not extract_config.get('file_path'):
        raise ValueError("File extractor requires 'file_path' in extract_config")
51
+
52
async def extract_streaming(
    self,
    extract_config: Dict[str, Any],
    params: Dict[str, Any],
    headers: Dict[str, Any],
    contract_dir: Optional[Any] = None,
    batch_size: int = 1000,
    max_records: Optional[int] = None,
    config_context: Optional[Dict[str, Any]] = None,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """
    Extract data from file(s) in batches.

    Supports:
    - Single files
    - Glob patterns for multiple files ('*' or '?' in the path); matching
      files are processed in sorted order
    - Compressed files (gzip, zip)

    Args:
        extract_config: Extract configuration; must contain 'file_path' and
            may contain 'format' (otherwise the format is detected from the path)
        params: Not used by this extractor
        headers: Not used by this extractor
        contract_dir: Directory holding extract.yaml; used only to build the
            source_file hint passed to resolve_values
        batch_size: Batch size forwarded to the per-file reader
        max_records: Upper bound on the total number of records across all
            files; a falsy value (None or 0) means no limit
        config_context: Context passed to resolve_values for variable injection

    Yields:
        Lists of record dictionaries

    Raises:
        ValueError: If 'file_path' is missing from extract_config
        FileNotFoundError: If the file does not exist or the glob matches nothing
    """
    # Resolve file_path with variable injection. A plain string contract_dir
    # is wrapped in Path so that the '/' join works for it as well.
    if contract_dir:
        base_dir = Path(contract_dir) if isinstance(contract_dir, str) else contract_dir
        source_file = str(base_dir / "extract.yaml")
    else:
        source_file = None

    file_path = extract_config.get('file_path')
    if not file_path:
        raise ValueError("File extractor requires 'file_path' in extract_config")

    file_path = resolve_values(file_path, context=config_context, source_file=source_file)

    # Detect format
    file_format = extract_config.get('format')
    if not file_format:
        file_format = self._detect_format(file_path)

    path = Path(file_path)
    is_glob = '*' in str(path) or '?' in str(path)

    if not is_glob:
        # Single file
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        async for batch in self._extract_from_file(
            path, file_format, batch_size, max_records, 0
        ):
            yield batch
        return

    # Glob pattern - process multiple files
    files = sorted(path.parent.glob(path.name))
    if not files:
        raise FileNotFoundError(f"No files found matching pattern: {file_path}")
    logger.info(f"Found {len(files)} files matching pattern: {file_path}")

    total_extracted = 0
    for file in files:
        # Each file gets only what is left of the global budget, so the
        # combined output can never exceed max_records.
        remaining = None
        if max_records:
            remaining = max_records - total_extracted
            if remaining <= 0:
                break

        logger.info(f"Processing file: {file}")
        # The offset is a per-file "records to skip" count, so every file is
        # read from its own beginning. Passing the running total here would
        # drop that many rows from each subsequent file.
        async for batch in self._extract_from_file(
            file, file_format, batch_size, remaining, 0
        ):
            total_extracted += len(batch)
            yield batch
            if max_records and total_extracted >= max_records:
                break
114
+
115
+ async def _extract_from_file(
116
+ self,
117
+ file_path: Path,
118
+ file_format: str,
119
+ batch_size: int,
120
+ max_records: Optional[int],
121
+ offset: int = 0,
122
+ ) -> AsyncIterator[List[Dict[str, Any]]]:
123
+ """Extract data from a single file."""
124
+ extracted_file = None
125
+ original_path = file_path
126
+
127
+ # Handle compressed files
128
+ if file_path.suffix == '.gz':
129
+ # Gzip compressed - pandas can handle this directly
130
+ # No need to decompress manually
131
+ pass
132
+ elif file_path.suffix == '.zip':
133
+ # Zip file - extract first file
134
+ with zipfile.ZipFile(file_path, 'r') as zip_ref:
135
+ file_list = zip_ref.namelist()
136
+ if not file_list:
137
+ raise ValueError(f"Zip file is empty: {file_path}")
138
+ # Use first file in zip
139
+ extracted_file = zip_ref.extract(file_list[0])
140
+ file_path = Path(extracted_file)
141
+
142
+ try:
143
+ if file_format == 'csv' or file_format == 'tsv':
144
+ async for batch in self._extract_csv(file_path, batch_size, max_records, offset, file_format):
145
+ yield batch
146
+ elif file_format == 'json':
147
+ async for batch in self._extract_json(file_path, batch_size, max_records, offset):
148
+ yield batch
149
+ elif file_format == 'jsonl':
150
+ async for batch in self._extract_jsonl(file_path, batch_size, max_records, offset):
151
+ yield batch
152
+ elif file_format == 'parquet':
153
+ async for batch in self._extract_parquet(file_path, batch_size, max_records, offset):
154
+ yield batch
155
+ elif file_format == 'excel':
156
+ async for batch in self._extract_excel(file_path, batch_size, max_records, offset):
157
+ yield batch
158
+ elif file_format == 'xml':
159
+ async for batch in self._extract_xml(file_path, batch_size, max_records, offset):
160
+ yield batch
161
+ else:
162
+ raise ValueError(f"Unsupported file format: {file_format}")
163
+ finally:
164
+ # Cleanup if we extracted from zip
165
+ if extracted_file and Path(extracted_file).exists():
166
+ Path(extracted_file).unlink()
167
+
168
+ async def _extract_csv(
169
+ self,
170
+ file_path: Path,
171
+ batch_size: int,
172
+ max_records: Optional[int],
173
+ offset: int,
174
+ format_type: str,
175
+ ) -> AsyncIterator[List[Dict[str, Any]]]:
176
+ """Extract data from CSV/TSV file."""
177
+ delimiter = '\t' if format_type == 'tsv' else ','
178
+
179
+ # Use pandas for efficient CSV reading
180
+ chunk_size = batch_size
181
+ total_read = 0
182
+
183
+ try:
184
+ for chunk in pd.read_csv(
185
+ file_path,
186
+ delimiter=delimiter,
187
+ chunksize=chunk_size,
188
+ skiprows=offset if offset > 0 else None,
189
+ ):
190
+ records = chunk.to_dict('records')
191
+
192
+ # Convert pandas types to native Python types
193
+ records = [self._convert_pandas_types(record) for record in records]
194
+
195
+ if max_records and total_read + len(records) > max_records:
196
+ records = records[:max_records - total_read]
197
+
198
+ total_read += len(records)
199
+ yield records
200
+
201
+ if max_records and total_read >= max_records:
202
+ break
203
+ except Exception as e:
204
+ raise RuntimeError(f"Error reading CSV file {file_path}: {e}") from e
205
+
206
+ async def _extract_json(
207
+ self,
208
+ file_path: Path,
209
+ batch_size: int,
210
+ max_records: Optional[int],
211
+ offset: int,
212
+ ) -> AsyncIterator[List[Dict[str, Any]]]:
213
+ """Extract data from JSON file."""
214
+ try:
215
+ # Handle gzip compressed JSON
216
+ if file_path.suffix == ".gz":
217
+ with gzip.open(file_path, "rt", encoding="utf-8") as f:
218
+ data = json.load(f)
219
+ else:
220
+ with open(file_path, 'r', encoding='utf-8') as f:
221
+ data = json.load(f)
222
+
223
+ # Handle different JSON structures
224
+ if isinstance(data, list):
225
+ records = data
226
+ elif isinstance(data, dict):
227
+ # Try to find array in common keys
228
+ for key in ['data', 'results', 'items', 'records', 'values']:
229
+ if key in data and isinstance(data[key], list):
230
+ records = data[key]
231
+ break
232
+ else:
233
+ # Single object
234
+ records = [data]
235
+ else:
236
+ raise ValueError(f"JSON file must contain a list or dict, got {type(data)}")
237
+
238
+ # Apply offset and max_records
239
+ if offset > 0:
240
+ records = records[offset:]
241
+ if max_records:
242
+ records = records[:max_records]
243
+
244
+ # Yield in batches
245
+ for i in range(0, len(records), batch_size):
246
+ yield records[i:i + batch_size]
247
+ except Exception as e:
248
+ raise RuntimeError(f"Error reading JSON file {file_path}: {e}") from e
249
+
250
+ async def _extract_jsonl(
251
+ self,
252
+ file_path: Path,
253
+ batch_size: int,
254
+ max_records: Optional[int],
255
+ offset: int,
256
+ ) -> AsyncIterator[List[Dict[str, Any]]]:
257
+ """Extract data from newline-delimited JSON file."""
258
+ try:
259
+ current_batch = []
260
+ total_read = 0
261
+ skipped = 0
262
+
263
+ # Handle gzip compressed JSONL
264
+ if file_path.suffix == '.gz':
265
+ import gzip
266
+ file_handle = gzip.open(file_path, 'rt', encoding='utf-8')
267
+ else:
268
+ file_handle = open(file_path, 'r', encoding='utf-8')
269
+
270
+ with file_handle as f:
271
+ for line in f:
272
+ # Skip lines until offset
273
+ if skipped < offset:
274
+ skipped += 1
275
+ continue
276
+
277
+ if max_records and total_read >= max_records:
278
+ break
279
+
280
+ line = line.strip()
281
+ if not line:
282
+ continue
283
+
284
+ try:
285
+ record = json.loads(line)
286
+ current_batch.append(record)
287
+ total_read += 1
288
+
289
+ if len(current_batch) >= batch_size:
290
+ yield current_batch
291
+ current_batch = []
292
+ except json.JSONDecodeError as e:
293
+ logger.warning(f"Skipping invalid JSON line in {file_path}: {e}")
294
+ continue
295
+
296
+ # Yield remaining records
297
+ if current_batch:
298
+ yield current_batch
299
+ except Exception as e:
300
+ raise RuntimeError(f"Error reading JSONL file {file_path}: {e}") from e
301
+
302
async def _extract_parquet(
    self,
    file_path: Path,
    batch_size: int,
    max_records: Optional[int],
    offset: int,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Load a Parquet file with pandas and yield its rows in batches.

    The whole file is read into a DataFrame first; ``offset`` and
    ``max_records`` are then applied before the frame is sliced into
    batches of ``batch_size`` rows.

    Args:
        file_path: Path of the Parquet file.
        batch_size: Number of rows per yielded batch.
        max_records: Passed to ``DataFrame.head`` when truthy; falsy means
            no cap.
        offset: Number of leading rows to drop when greater than zero.

    Yields:
        Lists of row dicts, each passed through ``_convert_pandas_types``.

    Raises:
        RuntimeError: On any failure; the original exception is chained.
    """
    try:
        frame = pd.read_parquet(file_path)

        # Narrow the frame to the requested window of rows
        if offset > 0:
            frame = frame.iloc[offset:]
        if max_records:
            frame = frame.head(max_records)

        for start in range(0, len(frame), batch_size):
            window = frame.iloc[start:start + batch_size]
            yield [
                self._convert_pandas_types(row)
                for row in window.to_dict('records')
            ]
    except Exception as e:
        raise RuntimeError(f"Error reading Parquet file {file_path}: {e}") from e
330
+
331
async def _extract_excel(
    self,
    file_path: Path,
    batch_size: int,
    max_records: Optional[int],
    offset: int,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Load an Excel file with pandas and yield its rows in batches.

    The workbook is read via ``pd.read_excel`` with default arguments;
    ``offset`` and ``max_records`` are then applied before the frame is
    sliced into batches of ``batch_size`` rows.

    Args:
        file_path: Path of the Excel file.
        batch_size: Number of rows per yielded batch.
        max_records: Passed to ``DataFrame.head`` when truthy; falsy means
            no cap.
        offset: Number of leading rows to drop when greater than zero.

    Yields:
        Lists of row dicts, each passed through ``_convert_pandas_types``.

    Raises:
        RuntimeError: On any failure; the original exception is chained.
    """
    try:
        frame = pd.read_excel(file_path)

        # Narrow the frame to the requested window of rows
        if offset > 0:
            frame = frame.iloc[offset:]
        if max_records:
            frame = frame.head(max_records)

        for start in range(0, len(frame), batch_size):
            window = frame.iloc[start:start + batch_size]
            yield [
                self._convert_pandas_types(row)
                for row in window.to_dict('records')
            ]
    except Exception as e:
        raise RuntimeError(f"Error reading Excel file {file_path}: {e}") from e
359
+
360
async def _extract_xml(
    self,
    file_path: Path,
    batch_size: int,
    max_records: Optional[int],
    offset: int,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Load an XML file with pandas and yield its rows in batches.

    The document is parsed via ``pd.read_xml`` with default arguments;
    ``offset`` and ``max_records`` are then applied before the frame is
    sliced into batches of ``batch_size`` rows.

    Args:
        file_path: Path of the XML file.
        batch_size: Number of rows per yielded batch.
        max_records: Passed to ``DataFrame.head`` when truthy; falsy means
            no cap.
        offset: Number of leading rows to drop when greater than zero.

    Yields:
        Lists of row dicts, each passed through ``_convert_pandas_types``.

    Raises:
        RuntimeError: On any failure; the original exception is chained.
    """
    try:
        # pandas does the XML parsing (requires lxml)
        frame = pd.read_xml(file_path)

        # Narrow the frame to the requested window of rows
        if offset > 0:
            frame = frame.iloc[offset:]
        if max_records:
            frame = frame.head(max_records)

        for start in range(0, len(frame), batch_size):
            window = frame.iloc[start:start + batch_size]
            yield [
                self._convert_pandas_types(row)
                for row in window.to_dict('records')
            ]
    except Exception as e:
        raise RuntimeError(f"Error reading XML file {file_path}: {e}") from e
388
+
389
+ def _detect_format(self, file_path: str) -> str:
390
+ """Detect file format from extension."""
391
+ path = Path(file_path)
392
+ suffix = path.suffix.lower()
393
+
394
+ if suffix in SUPPORTED_FORMATS:
395
+ return SUPPORTED_FORMATS[suffix]
396
+
397
+ # Check for compressed files
398
+ if suffix == '.gz':
399
+ # Remove .gz and check again
400
+ stem_suffix = path.stem.split('.')[-1] if '.' in path.stem else ''
401
+ if f'.{stem_suffix}' in SUPPORTED_FORMATS:
402
+ return SUPPORTED_FORMATS[f'.{stem_suffix}']
403
+
404
+ raise ValueError(f"Could not detect file format from extension: {suffix}")
405
+
406
+ def _convert_pandas_types(self, record: Dict[str, Any]) -> Dict[str, Any]:
407
+ """Convert pandas types to native Python types."""
408
+ converted = {}
409
+ for key, value in record.items():
410
+ if pd.isna(value):
411
+ converted[key] = None
412
+ elif isinstance(value, (pd.Timestamp, pd.DatetimeTZDtype)):
413
+ converted[key] = value.isoformat()
414
+ elif isinstance(value, pd.Timedelta):
415
+ converted[key] = str(value)
416
+ else:
417
+ converted[key] = value
418
+ return converted