pycharter 0.0.22__py3-none-any.whl → 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (404) hide show
  1. api/main.py +27 -1
  2. api/models/docs.py +68 -0
  3. api/models/evolution.py +117 -0
  4. api/models/tracking.py +111 -0
  5. api/models/validation.py +46 -6
  6. api/routes/v1/__init__.py +14 -1
  7. api/routes/v1/docs.py +187 -0
  8. api/routes/v1/evolution.py +337 -0
  9. api/routes/v1/templates.py +211 -27
  10. api/routes/v1/tracking.py +301 -0
  11. api/routes/v1/validation.py +68 -31
  12. pycharter/__init__.py +268 -58
  13. pycharter/data/templates/contract/template_coercion_rules.yaml +57 -0
  14. pycharter/data/templates/contract/template_contract.yaml +122 -0
  15. pycharter/data/templates/contract/template_metadata.yaml +68 -0
  16. pycharter/data/templates/contract/template_schema.yaml +100 -0
  17. pycharter/data/templates/contract/template_validation_rules.yaml +75 -0
  18. pycharter/data/templates/etl/README.md +224 -0
  19. pycharter/data/templates/etl/extract_cloud_azure.yaml +24 -0
  20. pycharter/data/templates/etl/extract_cloud_gcs.yaml +25 -0
  21. pycharter/data/templates/etl/extract_cloud_s3.yaml +30 -0
  22. pycharter/data/templates/etl/extract_database.yaml +34 -0
  23. pycharter/data/templates/etl/extract_database_ssh.yaml +40 -0
  24. pycharter/data/templates/etl/extract_file_csv.yaml +21 -0
  25. pycharter/data/templates/etl/extract_file_glob.yaml +25 -0
  26. pycharter/data/templates/etl/extract_file_json.yaml +24 -0
  27. pycharter/data/templates/etl/extract_file_parquet.yaml +20 -0
  28. pycharter/data/templates/etl/extract_http_paginated.yaml +79 -0
  29. pycharter/data/templates/etl/extract_http_path_params.yaml +38 -0
  30. pycharter/data/templates/etl/extract_http_simple.yaml +62 -0
  31. pycharter/data/templates/etl/load_cloud_azure.yaml +24 -0
  32. pycharter/data/templates/etl/load_cloud_gcs.yaml +22 -0
  33. pycharter/data/templates/etl/load_cloud_s3.yaml +27 -0
  34. pycharter/data/templates/etl/load_file.yaml +34 -0
  35. pycharter/data/templates/etl/load_insert.yaml +18 -0
  36. pycharter/data/templates/etl/load_postgresql.yaml +39 -0
  37. pycharter/data/templates/etl/load_sqlite.yaml +21 -0
  38. pycharter/data/templates/etl/load_truncate_and_load.yaml +20 -0
  39. pycharter/data/templates/etl/load_upsert.yaml +25 -0
  40. pycharter/data/templates/etl/load_with_dlq.yaml +34 -0
  41. pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +35 -0
  42. pycharter/data/templates/etl/pipeline_http_to_db.yaml +75 -0
  43. pycharter/data/templates/etl/transform_combined.yaml +48 -0
  44. pycharter/data/templates/etl/transform_custom_function.yaml +58 -0
  45. pycharter/data/templates/etl/transform_jsonata.yaml +51 -0
  46. pycharter/data/templates/etl/transform_simple.yaml +59 -0
  47. pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
  48. pycharter/docs_generator/__init__.py +43 -0
  49. pycharter/docs_generator/generator.py +465 -0
  50. pycharter/docs_generator/renderers.py +247 -0
  51. pycharter/etl_generator/__init__.py +168 -80
  52. pycharter/etl_generator/builder.py +121 -0
  53. pycharter/etl_generator/config_loader.py +394 -0
  54. pycharter/etl_generator/config_validator.py +418 -0
  55. pycharter/etl_generator/context.py +132 -0
  56. pycharter/etl_generator/expression.py +499 -0
  57. pycharter/etl_generator/extractors/__init__.py +30 -0
  58. pycharter/etl_generator/extractors/base.py +70 -0
  59. pycharter/etl_generator/extractors/cloud_storage.py +530 -0
  60. pycharter/etl_generator/extractors/database.py +221 -0
  61. pycharter/etl_generator/extractors/factory.py +185 -0
  62. pycharter/etl_generator/extractors/file.py +475 -0
  63. pycharter/etl_generator/extractors/http.py +895 -0
  64. pycharter/etl_generator/extractors/streaming.py +57 -0
  65. pycharter/etl_generator/loaders/__init__.py +41 -0
  66. pycharter/etl_generator/loaders/base.py +35 -0
  67. pycharter/etl_generator/loaders/cloud.py +87 -0
  68. pycharter/etl_generator/loaders/cloud_storage_loader.py +275 -0
  69. pycharter/etl_generator/loaders/database.py +274 -0
  70. pycharter/etl_generator/loaders/factory.py +180 -0
  71. pycharter/etl_generator/loaders/file.py +72 -0
  72. pycharter/etl_generator/loaders/file_loader.py +130 -0
  73. pycharter/etl_generator/pipeline.py +743 -0
  74. pycharter/etl_generator/protocols.py +54 -0
  75. pycharter/etl_generator/result.py +63 -0
  76. pycharter/etl_generator/schemas/__init__.py +49 -0
  77. pycharter/etl_generator/transformers/__init__.py +49 -0
  78. pycharter/etl_generator/transformers/base.py +63 -0
  79. pycharter/etl_generator/transformers/config.py +45 -0
  80. pycharter/etl_generator/transformers/custom_function.py +101 -0
  81. pycharter/etl_generator/transformers/jsonata_transformer.py +56 -0
  82. pycharter/etl_generator/transformers/operations.py +218 -0
  83. pycharter/etl_generator/transformers/pipeline.py +54 -0
  84. pycharter/etl_generator/transformers/simple_operations.py +131 -0
  85. pycharter/quality/__init__.py +25 -0
  86. pycharter/quality/tracking/__init__.py +64 -0
  87. pycharter/quality/tracking/collector.py +318 -0
  88. pycharter/quality/tracking/exporters.py +238 -0
  89. pycharter/quality/tracking/models.py +194 -0
  90. pycharter/quality/tracking/store.py +385 -0
  91. pycharter/runtime_validator/__init__.py +20 -7
  92. pycharter/runtime_validator/builder.py +328 -0
  93. pycharter/runtime_validator/validator.py +311 -7
  94. pycharter/runtime_validator/validator_core.py +61 -0
  95. pycharter/schema_evolution/__init__.py +61 -0
  96. pycharter/schema_evolution/compatibility.py +270 -0
  97. pycharter/schema_evolution/diff.py +496 -0
  98. pycharter/schema_evolution/models.py +201 -0
  99. pycharter/shared/__init__.py +56 -0
  100. pycharter/shared/errors.py +296 -0
  101. pycharter/shared/protocols.py +234 -0
  102. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/METADATA +146 -26
  103. pycharter-0.0.24.dist-info/RECORD +543 -0
  104. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/WHEEL +1 -1
  105. ui/static/404/index.html +1 -1
  106. ui/static/404.html +1 -1
  107. ui/static/__next.__PAGE__.txt +1 -1
  108. ui/static/__next._full.txt +1 -1
  109. ui/static/__next._head.txt +1 -1
  110. ui/static/__next._index.txt +1 -1
  111. ui/static/__next._tree.txt +1 -1
  112. ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
  113. ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
  114. ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
  115. ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
  116. ui/static/_not-found/__next._full.txt +1 -1
  117. ui/static/_not-found/__next._head.txt +1 -1
  118. ui/static/_not-found/__next._index.txt +1 -1
  119. ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  120. ui/static/_not-found/__next._not-found.txt +1 -1
  121. ui/static/_not-found/__next._tree.txt +1 -1
  122. ui/static/_not-found/index.html +1 -1
  123. ui/static/_not-found/index.txt +1 -1
  124. ui/static/contracts/__next._full.txt +2 -2
  125. ui/static/contracts/__next._head.txt +1 -1
  126. ui/static/contracts/__next._index.txt +1 -1
  127. ui/static/contracts/__next._tree.txt +1 -1
  128. ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  129. ui/static/contracts/__next.contracts.txt +1 -1
  130. ui/static/contracts/index.html +1 -1
  131. ui/static/contracts/index.txt +2 -2
  132. ui/static/documentation/__next._full.txt +1 -1
  133. ui/static/documentation/__next._head.txt +1 -1
  134. ui/static/documentation/__next._index.txt +1 -1
  135. ui/static/documentation/__next._tree.txt +1 -1
  136. ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
  137. ui/static/documentation/__next.documentation.txt +1 -1
  138. ui/static/documentation/index.html +2 -2
  139. ui/static/documentation/index.txt +1 -1
  140. ui/static/index.html +1 -1
  141. ui/static/index.txt +1 -1
  142. ui/static/metadata/__next._full.txt +1 -1
  143. ui/static/metadata/__next._head.txt +1 -1
  144. ui/static/metadata/__next._index.txt +1 -1
  145. ui/static/metadata/__next._tree.txt +1 -1
  146. ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  147. ui/static/metadata/__next.metadata.txt +1 -1
  148. ui/static/metadata/index.html +1 -1
  149. ui/static/metadata/index.txt +1 -1
  150. ui/static/quality/__next._full.txt +2 -2
  151. ui/static/quality/__next._head.txt +1 -1
  152. ui/static/quality/__next._index.txt +1 -1
  153. ui/static/quality/__next._tree.txt +1 -1
  154. ui/static/quality/__next.quality.__PAGE__.txt +2 -2
  155. ui/static/quality/__next.quality.txt +1 -1
  156. ui/static/quality/index.html +2 -2
  157. ui/static/quality/index.txt +2 -2
  158. ui/static/rules/__next._full.txt +1 -1
  159. ui/static/rules/__next._head.txt +1 -1
  160. ui/static/rules/__next._index.txt +1 -1
  161. ui/static/rules/__next._tree.txt +1 -1
  162. ui/static/rules/__next.rules.__PAGE__.txt +1 -1
  163. ui/static/rules/__next.rules.txt +1 -1
  164. ui/static/rules/index.html +1 -1
  165. ui/static/rules/index.txt +1 -1
  166. ui/static/schemas/__next._full.txt +1 -1
  167. ui/static/schemas/__next._head.txt +1 -1
  168. ui/static/schemas/__next._index.txt +1 -1
  169. ui/static/schemas/__next._tree.txt +1 -1
  170. ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  171. ui/static/schemas/__next.schemas.txt +1 -1
  172. ui/static/schemas/index.html +1 -1
  173. ui/static/schemas/index.txt +1 -1
  174. ui/static/settings/__next._full.txt +1 -1
  175. ui/static/settings/__next._head.txt +1 -1
  176. ui/static/settings/__next._index.txt +1 -1
  177. ui/static/settings/__next._tree.txt +1 -1
  178. ui/static/settings/__next.settings.__PAGE__.txt +1 -1
  179. ui/static/settings/__next.settings.txt +1 -1
  180. ui/static/settings/index.html +1 -1
  181. ui/static/settings/index.txt +1 -1
  182. ui/static/static/404/index.html +1 -1
  183. ui/static/static/404.html +1 -1
  184. ui/static/static/__next.__PAGE__.txt +1 -1
  185. ui/static/static/__next._full.txt +2 -2
  186. ui/static/static/__next._head.txt +1 -1
  187. ui/static/static/__next._index.txt +2 -2
  188. ui/static/static/__next._tree.txt +2 -2
  189. ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
  190. ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
  191. ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
  192. ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
  193. ui/static/static/_not-found/__next._full.txt +2 -2
  194. ui/static/static/_not-found/__next._head.txt +1 -1
  195. ui/static/static/_not-found/__next._index.txt +2 -2
  196. ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  197. ui/static/static/_not-found/__next._not-found.txt +1 -1
  198. ui/static/static/_not-found/__next._tree.txt +2 -2
  199. ui/static/static/_not-found/index.html +1 -1
  200. ui/static/static/_not-found/index.txt +2 -2
  201. ui/static/static/contracts/__next._full.txt +3 -3
  202. ui/static/static/contracts/__next._head.txt +1 -1
  203. ui/static/static/contracts/__next._index.txt +2 -2
  204. ui/static/static/contracts/__next._tree.txt +2 -2
  205. ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  206. ui/static/static/contracts/__next.contracts.txt +1 -1
  207. ui/static/static/contracts/index.html +1 -1
  208. ui/static/static/contracts/index.txt +3 -3
  209. ui/static/static/documentation/__next._full.txt +3 -3
  210. ui/static/static/documentation/__next._head.txt +1 -1
  211. ui/static/static/documentation/__next._index.txt +2 -2
  212. ui/static/static/documentation/__next._tree.txt +2 -2
  213. ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
  214. ui/static/static/documentation/__next.documentation.txt +1 -1
  215. ui/static/static/documentation/index.html +2 -2
  216. ui/static/static/documentation/index.txt +3 -3
  217. ui/static/static/index.html +1 -1
  218. ui/static/static/index.txt +2 -2
  219. ui/static/static/metadata/__next._full.txt +2 -2
  220. ui/static/static/metadata/__next._head.txt +1 -1
  221. ui/static/static/metadata/__next._index.txt +2 -2
  222. ui/static/static/metadata/__next._tree.txt +2 -2
  223. ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  224. ui/static/static/metadata/__next.metadata.txt +1 -1
  225. ui/static/static/metadata/index.html +1 -1
  226. ui/static/static/metadata/index.txt +2 -2
  227. ui/static/static/quality/__next._full.txt +2 -2
  228. ui/static/static/quality/__next._head.txt +1 -1
  229. ui/static/static/quality/__next._index.txt +2 -2
  230. ui/static/static/quality/__next._tree.txt +2 -2
  231. ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
  232. ui/static/static/quality/__next.quality.txt +1 -1
  233. ui/static/static/quality/index.html +2 -2
  234. ui/static/static/quality/index.txt +2 -2
  235. ui/static/static/rules/__next._full.txt +2 -2
  236. ui/static/static/rules/__next._head.txt +1 -1
  237. ui/static/static/rules/__next._index.txt +2 -2
  238. ui/static/static/rules/__next._tree.txt +2 -2
  239. ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
  240. ui/static/static/rules/__next.rules.txt +1 -1
  241. ui/static/static/rules/index.html +1 -1
  242. ui/static/static/rules/index.txt +2 -2
  243. ui/static/static/schemas/__next._full.txt +2 -2
  244. ui/static/static/schemas/__next._head.txt +1 -1
  245. ui/static/static/schemas/__next._index.txt +2 -2
  246. ui/static/static/schemas/__next._tree.txt +2 -2
  247. ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  248. ui/static/static/schemas/__next.schemas.txt +1 -1
  249. ui/static/static/schemas/index.html +1 -1
  250. ui/static/static/schemas/index.txt +2 -2
  251. ui/static/static/settings/__next._full.txt +2 -2
  252. ui/static/static/settings/__next._head.txt +1 -1
  253. ui/static/static/settings/__next._index.txt +2 -2
  254. ui/static/static/settings/__next._tree.txt +2 -2
  255. ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
  256. ui/static/static/settings/__next.settings.txt +1 -1
  257. ui/static/static/settings/index.html +1 -1
  258. ui/static/static/settings/index.txt +2 -2
  259. ui/static/static/static/.gitkeep +0 -0
  260. ui/static/static/static/404/index.html +1 -0
  261. ui/static/static/static/404.html +1 -0
  262. ui/static/static/static/__next.__PAGE__.txt +10 -0
  263. ui/static/static/static/__next._full.txt +30 -0
  264. ui/static/static/static/__next._head.txt +7 -0
  265. ui/static/static/static/__next._index.txt +9 -0
  266. ui/static/static/static/__next._tree.txt +2 -0
  267. ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
  268. ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
  269. ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
  270. ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
  271. ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
  272. ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
  273. ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
  274. ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
  275. ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
  276. ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
  277. ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
  278. ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
  279. ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
  280. ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
  281. ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
  282. ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
  283. ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
  284. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
  285. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
  286. ui/static/static/static/_not-found/__next._full.txt +17 -0
  287. ui/static/static/static/_not-found/__next._head.txt +7 -0
  288. ui/static/static/static/_not-found/__next._index.txt +9 -0
  289. ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
  290. ui/static/static/static/_not-found/__next._not-found.txt +4 -0
  291. ui/static/static/static/_not-found/__next._tree.txt +2 -0
  292. ui/static/static/static/_not-found/index.html +1 -0
  293. ui/static/static/static/_not-found/index.txt +17 -0
  294. ui/static/static/static/contracts/__next._full.txt +21 -0
  295. ui/static/static/static/contracts/__next._head.txt +7 -0
  296. ui/static/static/static/contracts/__next._index.txt +9 -0
  297. ui/static/static/static/contracts/__next._tree.txt +2 -0
  298. ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
  299. ui/static/static/static/contracts/__next.contracts.txt +4 -0
  300. ui/static/static/static/contracts/index.html +1 -0
  301. ui/static/static/static/contracts/index.txt +21 -0
  302. ui/static/static/static/documentation/__next._full.txt +21 -0
  303. ui/static/static/static/documentation/__next._head.txt +7 -0
  304. ui/static/static/static/documentation/__next._index.txt +9 -0
  305. ui/static/static/static/documentation/__next._tree.txt +2 -0
  306. ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
  307. ui/static/static/static/documentation/__next.documentation.txt +4 -0
  308. ui/static/static/static/documentation/index.html +93 -0
  309. ui/static/static/static/documentation/index.txt +21 -0
  310. ui/static/static/static/index.html +1 -0
  311. ui/static/static/static/index.txt +30 -0
  312. ui/static/static/static/metadata/__next._full.txt +21 -0
  313. ui/static/static/static/metadata/__next._head.txt +7 -0
  314. ui/static/static/static/metadata/__next._index.txt +9 -0
  315. ui/static/static/static/metadata/__next._tree.txt +2 -0
  316. ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
  317. ui/static/static/static/metadata/__next.metadata.txt +4 -0
  318. ui/static/static/static/metadata/index.html +1 -0
  319. ui/static/static/static/metadata/index.txt +21 -0
  320. ui/static/static/static/quality/__next._full.txt +21 -0
  321. ui/static/static/static/quality/__next._head.txt +7 -0
  322. ui/static/static/static/quality/__next._index.txt +9 -0
  323. ui/static/static/static/quality/__next._tree.txt +2 -0
  324. ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
  325. ui/static/static/static/quality/__next.quality.txt +4 -0
  326. ui/static/static/static/quality/index.html +2 -0
  327. ui/static/static/static/quality/index.txt +21 -0
  328. ui/static/static/static/rules/__next._full.txt +21 -0
  329. ui/static/static/static/rules/__next._head.txt +7 -0
  330. ui/static/static/static/rules/__next._index.txt +9 -0
  331. ui/static/static/static/rules/__next._tree.txt +2 -0
  332. ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
  333. ui/static/static/static/rules/__next.rules.txt +4 -0
  334. ui/static/static/static/rules/index.html +1 -0
  335. ui/static/static/static/rules/index.txt +21 -0
  336. ui/static/static/static/schemas/__next._full.txt +21 -0
  337. ui/static/static/static/schemas/__next._head.txt +7 -0
  338. ui/static/static/static/schemas/__next._index.txt +9 -0
  339. ui/static/static/static/schemas/__next._tree.txt +2 -0
  340. ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
  341. ui/static/static/static/schemas/__next.schemas.txt +4 -0
  342. ui/static/static/static/schemas/index.html +1 -0
  343. ui/static/static/static/schemas/index.txt +21 -0
  344. ui/static/static/static/settings/__next._full.txt +21 -0
  345. ui/static/static/static/settings/__next._head.txt +7 -0
  346. ui/static/static/static/settings/__next._index.txt +9 -0
  347. ui/static/static/static/settings/__next._tree.txt +2 -0
  348. ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
  349. ui/static/static/static/settings/__next.settings.txt +4 -0
  350. ui/static/static/static/settings/index.html +1 -0
  351. ui/static/static/static/settings/index.txt +21 -0
  352. ui/static/static/static/validation/__next._full.txt +21 -0
  353. ui/static/static/static/validation/__next._head.txt +7 -0
  354. ui/static/static/static/validation/__next._index.txt +9 -0
  355. ui/static/static/static/validation/__next._tree.txt +2 -0
  356. ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
  357. ui/static/static/static/validation/__next.validation.txt +4 -0
  358. ui/static/static/static/validation/index.html +1 -0
  359. ui/static/static/static/validation/index.txt +21 -0
  360. ui/static/static/validation/__next._full.txt +2 -2
  361. ui/static/static/validation/__next._head.txt +1 -1
  362. ui/static/static/validation/__next._index.txt +2 -2
  363. ui/static/static/validation/__next._tree.txt +2 -2
  364. ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
  365. ui/static/static/validation/__next.validation.txt +1 -1
  366. ui/static/static/validation/index.html +1 -1
  367. ui/static/static/validation/index.txt +2 -2
  368. ui/static/validation/__next._full.txt +2 -2
  369. ui/static/validation/__next._head.txt +1 -1
  370. ui/static/validation/__next._index.txt +1 -1
  371. ui/static/validation/__next._tree.txt +1 -1
  372. ui/static/validation/__next.validation.__PAGE__.txt +2 -2
  373. ui/static/validation/__next.validation.txt +1 -1
  374. ui/static/validation/index.html +1 -1
  375. ui/static/validation/index.txt +2 -2
  376. pycharter/data/templates/template_coercion_rules.yaml +0 -15
  377. pycharter/data/templates/template_contract.yaml +0 -587
  378. pycharter/data/templates/template_metadata.yaml +0 -38
  379. pycharter/data/templates/template_schema.yaml +0 -22
  380. pycharter/data/templates/template_transform_advanced.yaml +0 -50
  381. pycharter/data/templates/template_transform_simple.yaml +0 -59
  382. pycharter/data/templates/template_validation_rules.yaml +0 -29
  383. pycharter/etl_generator/extraction.py +0 -916
  384. pycharter/etl_generator/factory.py +0 -174
  385. pycharter/etl_generator/orchestrator.py +0 -1650
  386. pycharter/integrations/__init__.py +0 -19
  387. pycharter/integrations/kafka.py +0 -178
  388. pycharter/integrations/streaming.py +0 -100
  389. pycharter-0.0.22.dist-info/RECORD +0 -358
  390. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/entry_points.txt +0 -0
  391. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/licenses/LICENSE +0 -0
  392. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/top_level.txt +0 -0
  393. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
  394. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
  395. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
  396. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
  397. /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
  398. /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
  399. /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
  400. /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
  401. /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
  402. /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
  403. /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
  404. /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
@@ -0,0 +1,475 @@
1
+ """
2
+ File-based extractor for ETL orchestrator.
3
+
4
+ Supports reading from local files in various formats:
5
+ - CSV, TSV
6
+ - JSON (single file or newline-delimited JSON)
7
+ - Parquet
8
+ - Excel (xlsx, xls)
9
+ - XML
10
+ """
11
+
12
+ import gzip
13
+ import json
14
+ import logging
15
+ import zipfile
16
+ from pathlib import Path
17
+ from typing import Any, AsyncIterator, Dict, List, Optional
18
+
19
+ import pandas as pd
20
+
21
+ from pycharter.etl_generator.extractors.base import BaseExtractor
22
+ from pycharter.utils.value_injector import resolve_values
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
# Maps a file suffix (with its leading dot) to a format name. The values are
# the same names that FileExtractor._extract_from_file dispatches on.
SUPPORTED_FORMATS = {
    # Delimited text
    ".csv": "csv",
    ".tsv": "tsv",
    # JSON documents and newline-delimited JSON
    ".json": "json",
    ".jsonl": "jsonl",
    ".ndjson": "jsonl",
    # Columnar / spreadsheet / markup
    ".parquet": "parquet",
    ".xlsx": "excel",
    ".xls": "excel",
    ".xml": "xml",
}
38
+
39
+
40
+ class FileExtractor(BaseExtractor):
41
+ """
42
+ Extractor for file-based data sources.
43
+
44
+ Supports two modes:
45
+ 1. Programmatic API:
46
+ >>> extractor = FileExtractor(path="data.csv")
47
+ >>> async for batch in extractor.extract():
48
+ ... process(batch)
49
+
50
+ 2. Config-driven:
51
+ >>> extractor = FileExtractor()
52
+ >>> async for batch in extractor.extract_streaming(config, params, headers):
53
+ ... process(batch)
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ path: Optional[str] = None,
59
+ file_format: Optional[str] = None,
60
+ batch_size: int = 1000,
61
+ max_records: Optional[int] = None,
62
+ ):
63
+ self.path = path
64
+ self.file_format = file_format
65
+ self.batch_size = batch_size
66
+ self.max_records = max_records
67
+
68
+ @classmethod
69
+ def from_config(cls, config: Dict[str, Any]) -> "FileExtractor":
70
+ """Create extractor from configuration dict."""
71
+ return cls(
72
+ path=config.get("file_path") or config.get("path"),
73
+ file_format=config.get("format"),
74
+ batch_size=config.get("batch_size", 1000),
75
+ max_records=config.get("max_records"),
76
+ )
77
+
78
+ async def extract(self, **params) -> AsyncIterator[List[Dict[str, Any]]]:
79
+ """
80
+ Extract data from file.
81
+
82
+ Yields:
83
+ Batches of records
84
+ """
85
+ if not self.path:
86
+ raise ValueError("File path is required")
87
+
88
+ extract_config = {
89
+ "file_path": self.path,
90
+ "format": self.file_format,
91
+ }
92
+
93
+ async for batch in self.extract_streaming(
94
+ extract_config, {}, {},
95
+ batch_size=self.batch_size,
96
+ max_records=self.max_records,
97
+ ):
98
+ yield batch
99
+
100
def validate_config(self, extract_config: Dict[str, Any]) -> None:
    """
    Check that a config dict is usable by the file extractor.

    Raises:
        ValueError: If ``source_type`` is present and is not ``'file'``,
            or if ``file_path`` is missing or falsy.
    """
    # source_type is optional; it is only rejected when explicitly wrong.
    if 'source_type' in extract_config:
        if extract_config['source_type'] != 'file':
            raise ValueError(f"FileExtractor requires source_type='file', got '{extract_config.get('source_type')}'")

    if not extract_config.get('file_path'):
        raise ValueError("File extractor requires 'file_path' in extract_config")
108
+
109
async def extract_streaming(
    self,
    extract_config: Dict[str, Any],
    params: Dict[str, Any],
    headers: Dict[str, Any],
    contract_dir: Optional[Any] = None,
    batch_size: int = 1000,
    max_records: Optional[int] = None,
    config_context: Optional[Dict[str, Any]] = None,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """
    Extract data from file(s) in batches.

    Supports:
    - Single files
    - Glob patterns for multiple files (``*`` / ``?`` in the final path
      component; matches are processed in sorted order)
    - Compressed files (gzip, zip)

    Args:
        extract_config: Must contain ``file_path``; may contain ``format``.
            When ``format`` is missing the format is taken from
            ``self._detect_format``.
        params: Not used by this extractor.
        headers: Not used by this extractor.
        contract_dir: Optional directory (``str`` or path object); when
            given, ``<contract_dir>/extract.yaml`` is passed to
            ``resolve_values`` as ``source_file``.
        batch_size: Batch size forwarded to the per-file reader.
        max_records: Cap on the total number of records yielded across all
            files; a falsy value (``None`` or ``0``) means no cap.
        config_context: Passed to ``resolve_values`` as ``context``.

    Yields:
        Batches of records

    Raises:
        ValueError: If ``file_path`` is missing from ``extract_config``.
        FileNotFoundError: If the file, or any match for the pattern, does
            not exist.
    """
    # Resolve file_path with variable injection. A plain string contract_dir
    # is wrapped in Path so that the "/" join below works for it as well.
    if contract_dir:
        base_dir = Path(contract_dir) if isinstance(contract_dir, str) else contract_dir
        source_file = str(base_dir / "extract.yaml")
    else:
        source_file = None

    file_path = extract_config.get('file_path')
    if not file_path:
        raise ValueError("File extractor requires 'file_path' in extract_config")

    file_path = resolve_values(file_path, context=config_context, source_file=source_file)

    # Explicit format wins; otherwise detect it from the path
    file_format = extract_config.get('format') or self._detect_format(file_path)

    path = Path(file_path)
    if '*' in str(path) or '?' in str(path):
        # Glob pattern - process multiple files
        files = sorted(path.parent.glob(path.name))
        if not files:
            raise FileNotFoundError(f"No files found matching pattern: {file_path}")
        logger.info(f"Found {len(files)} files matching pattern: {file_path}")

        total_extracted = 0
        for file in files:
            if max_records and total_extracted >= max_records:
                break

            # Each file is read from its own first record (offset 0) and is
            # capped at what is still allowed overall. The per-file readers
            # interpret `offset` as "records to skip in this file", so
            # passing the running total would drop rows from later files.
            remaining = max_records - total_extracted if max_records else None

            logger.info(f"Processing file: {file}")
            async for batch in self._extract_from_file(
                file, file_format, batch_size, remaining, 0
            ):
                # Defensive trim in case a reader returns more than asked for
                if max_records and total_extracted + len(batch) > max_records:
                    batch = batch[:max_records - total_extracted]
                total_extracted += len(batch)
                yield batch
                if max_records and total_extracted >= max_records:
                    break
    else:
        # Single file
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        async for batch in self._extract_from_file(
            path, file_format, batch_size, max_records, 0
        ):
            yield batch
171
+
172
+ async def _extract_from_file(
173
+ self,
174
+ file_path: Path,
175
+ file_format: str,
176
+ batch_size: int,
177
+ max_records: Optional[int],
178
+ offset: int = 0,
179
+ ) -> AsyncIterator[List[Dict[str, Any]]]:
180
+ """Extract data from a single file."""
181
+ extracted_file = None
182
+ original_path = file_path
183
+
184
+ # Handle compressed files
185
+ if file_path.suffix == '.gz':
186
+ # Gzip compressed - pandas can handle this directly
187
+ # No need to decompress manually
188
+ pass
189
+ elif file_path.suffix == '.zip':
190
+ # Zip file - extract first file
191
+ with zipfile.ZipFile(file_path, 'r') as zip_ref:
192
+ file_list = zip_ref.namelist()
193
+ if not file_list:
194
+ raise ValueError(f"Zip file is empty: {file_path}")
195
+ # Use first file in zip
196
+ extracted_file = zip_ref.extract(file_list[0])
197
+ file_path = Path(extracted_file)
198
+
199
+ try:
200
+ if file_format == 'csv' or file_format == 'tsv':
201
+ async for batch in self._extract_csv(file_path, batch_size, max_records, offset, file_format):
202
+ yield batch
203
+ elif file_format == 'json':
204
+ async for batch in self._extract_json(file_path, batch_size, max_records, offset):
205
+ yield batch
206
+ elif file_format == 'jsonl':
207
+ async for batch in self._extract_jsonl(file_path, batch_size, max_records, offset):
208
+ yield batch
209
+ elif file_format == 'parquet':
210
+ async for batch in self._extract_parquet(file_path, batch_size, max_records, offset):
211
+ yield batch
212
+ elif file_format == 'excel':
213
+ async for batch in self._extract_excel(file_path, batch_size, max_records, offset):
214
+ yield batch
215
+ elif file_format == 'xml':
216
+ async for batch in self._extract_xml(file_path, batch_size, max_records, offset):
217
+ yield batch
218
+ else:
219
+ raise ValueError(f"Unsupported file format: {file_format}")
220
+ finally:
221
+ # Cleanup if we extracted from zip
222
+ if extracted_file and Path(extracted_file).exists():
223
+ Path(extracted_file).unlink()
224
+
225
async def _extract_csv(
    self,
    file_path: Path,
    batch_size: int,
    max_records: Optional[int],
    offset: int,
    format_type: str,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """
    Extract data from CSV/TSV file.

    The file is read with pandas in chunks of ``batch_size`` rows; each row
    is turned into a dict and passed through ``self._convert_pandas_types``.

    Args:
        file_path: File to read.
        batch_size: Number of rows per pandas chunk (and per yielded batch).
        max_records: Cap on rows yielded; a falsy value means no cap.
        offset: Number of leading data rows to skip. The header line is
            always kept.
        format_type: ``'tsv'`` selects a tab delimiter; anything else uses
            a comma.

    Yields:
        Batches of records

    Raises:
        RuntimeError: If reading or converting the file fails; the original
            exception is chained as the cause.
    """
    delimiter = '\t' if format_type == 'tsv' else ','

    # Line 0 is the header row. Skipping lines 1..offset drops only data
    # rows; an integer skiprows would discard the header as well and promote
    # a data row to column names.
    skiprows = list(range(1, offset + 1)) if offset > 0 else None

    total_read = 0

    try:
        # Use pandas for efficient CSV reading
        for chunk in pd.read_csv(
            file_path,
            delimiter=delimiter,
            chunksize=batch_size,
            skiprows=skiprows,
        ):
            # Convert pandas types to native Python types
            records = [
                self._convert_pandas_types(record)
                for record in chunk.to_dict('records')
            ]

            if max_records and total_read + len(records) > max_records:
                records = records[:max_records - total_read]

            total_read += len(records)
            yield records

            if max_records and total_read >= max_records:
                break
    except Exception as e:
        raise RuntimeError(f"Error reading CSV file {file_path}: {e}") from e
262
+
263
async def _extract_json(
    self,
    file_path: Path,
    batch_size: int,
    max_records: Optional[int],
    offset: int,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Load a JSON document (optionally gzip-compressed) and yield it in batches.

    A top-level list is used as the record list. For a top-level object, the
    first of the keys ``data``, ``results``, ``items``, ``records``,
    ``values`` that holds a list supplies the records; if none does, the
    object itself is treated as a single record.

    Args:
        file_path: File to read; a ``.gz`` suffix selects gzip decoding.
        batch_size: Maximum number of records per yielded batch.
        max_records: Cap on the number of records; falsy means no cap.
        offset: Number of leading records to drop when greater than zero.

    Yields:
        Consecutive slices of the record list, each at most ``batch_size`` long.

    Raises:
        RuntimeError: On any failure, including a document that is neither a
            list nor an object; the original exception is chained.
    """
    try:
        # Pick the opener once; both are used in text mode with UTF-8.
        opener = gzip.open if file_path.suffix == ".gz" else open
        with opener(file_path, "rt", encoding="utf-8") as handle:
            payload = json.load(handle)

        if isinstance(payload, list):
            rows = payload
        elif isinstance(payload, dict):
            # Look for a list under one of the conventional wrapper keys.
            wrapped = next(
                (
                    payload[name]
                    for name in ('data', 'results', 'items', 'records', 'values')
                    if name in payload and isinstance(payload[name], list)
                ),
                None,
            )
            # No wrapper key found: the object is itself the only record.
            rows = [payload] if wrapped is None else wrapped
        else:
            raise ValueError(f"JSON file must contain a list or dict, got {type(payload)}")

        # Window the records: drop the offset first, then cap the count.
        if offset > 0:
            rows = rows[offset:]
        if max_records:
            rows = rows[:max_records]

        for start in range(0, len(rows), batch_size):
            stop = start + batch_size
            yield rows[start:stop]
    except Exception as e:
        raise RuntimeError(f"Error reading JSON file {file_path}: {e}") from e
306
+
307
async def _extract_jsonl(
    self,
    file_path: Path,
    batch_size: int,
    max_records: Optional[int],
    offset: int,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Stream a newline-delimited JSON file (optionally gzip-compressed) in batches.

    Args:
        file_path: File to read; a ``.gz`` suffix selects gzip decoding.
        batch_size: Number of parsed records per yielded batch.
        max_records: Cap on the number of parsed records; falsy means no cap.
        offset: Number of leading physical lines to skip. Blank lines inside
            that prefix count towards the offset.

    Yields:
        Lists of parsed records; the last list may be shorter than
        ``batch_size``.

    Raises:
        RuntimeError: On any read failure; the original exception is chained.
            Lines that are not valid JSON are logged and skipped instead.
    """
    try:
        pending = []
        parsed_count = 0

        # Handle gzip compressed JSONL
        if file_path.suffix == '.gz':
            import gzip
            stream = gzip.open(file_path, 'rt', encoding='utf-8')
        else:
            stream = open(file_path, 'r', encoding='utf-8')

        with stream as lines:
            for line_index, raw_line in enumerate(lines):
                # The first `offset` physical lines are ignored outright.
                if line_index < offset:
                    continue

                if max_records and parsed_count >= max_records:
                    break

                text = raw_line.strip()
                if not text:
                    continue

                try:
                    parsed = json.loads(text)
                except json.JSONDecodeError as e:
                    logger.warning(f"Skipping invalid JSON line in {file_path}: {e}")
                    continue

                pending.append(parsed)
                parsed_count += 1

                # Hand over a full batch and start a fresh one.
                if len(pending) >= batch_size:
                    yield pending
                    pending = []

        # Flush whatever is left after the last full batch.
        if pending:
            yield pending
    except Exception as e:
        raise RuntimeError(f"Error reading JSONL file {file_path}: {e}") from e
358
+
359
async def _extract_parquet(
    self,
    file_path: Path,
    batch_size: int,
    max_records: Optional[int],
    offset: int,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Read a Parquet file with pandas and yield its rows in batches.

    The whole file is loaded into one DataFrame before batching.

    Args:
        file_path: Parquet file to read.
        batch_size: Number of rows per yielded batch.
        max_records: Cap on the number of rows; falsy means no cap.
        offset: Number of leading rows to drop when greater than zero.

    Yields:
        Lists of row dicts converted by ``self._convert_pandas_types``.

    Raises:
        RuntimeError: On any failure; the original exception is chained.
    """
    try:
        frame = pd.read_parquet(file_path)

        # Window the frame: drop the offset first, then cap the row count.
        if offset > 0:
            frame = frame.iloc[offset:]
        if max_records:
            frame = frame.head(max_records)

        row_count = len(frame)
        for start in range(0, row_count, batch_size):
            window = frame.iloc[start:start + batch_size]
            yield [
                self._convert_pandas_types(row)
                for row in window.to_dict('records')
            ]
    except Exception as e:
        raise RuntimeError(f"Error reading Parquet file {file_path}: {e}") from e
387
+
388
async def _extract_excel(
    self,
    file_path: Path,
    batch_size: int,
    max_records: Optional[int],
    offset: int,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Read an Excel file with pandas and yield its rows in batches.

    ``pd.read_excel`` is called with default arguments and the result is
    loaded into one DataFrame before batching.

    Args:
        file_path: Excel file to read.
        batch_size: Number of rows per yielded batch.
        max_records: Cap on the number of rows; falsy means no cap.
        offset: Number of leading rows to drop when greater than zero.

    Yields:
        Lists of row dicts converted by ``self._convert_pandas_types``.

    Raises:
        RuntimeError: On any failure; the original exception is chained.
    """
    try:
        sheet = pd.read_excel(file_path)

        # Window the sheet: drop the offset first, then cap the row count.
        if offset > 0:
            sheet = sheet.iloc[offset:]
        if max_records:
            sheet = sheet.head(max_records)

        row_count = len(sheet)
        for start in range(0, row_count, batch_size):
            window = sheet.iloc[start:start + batch_size]
            yield [
                self._convert_pandas_types(row)
                for row in window.to_dict('records')
            ]
    except Exception as e:
        raise RuntimeError(f"Error reading Excel file {file_path}: {e}") from e
416
+
417
async def _extract_xml(
    self,
    file_path: Path,
    batch_size: int,
    max_records: Optional[int],
    offset: int,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Read an XML file with pandas and yield its rows in batches.

    ``pd.read_xml`` is called with default arguments (the original comment
    notes this requires lxml) and the result is loaded into one DataFrame
    before batching.

    Args:
        file_path: XML file to read.
        batch_size: Number of rows per yielded batch.
        max_records: Cap on the number of rows; falsy means no cap.
        offset: Number of leading rows to drop when greater than zero.

    Yields:
        Lists of row dicts converted by ``self._convert_pandas_types``.

    Raises:
        RuntimeError: On any failure; the original exception is chained.
    """
    try:
        table = pd.read_xml(file_path)

        # Window the table: drop the offset first, then cap the row count.
        if offset > 0:
            table = table.iloc[offset:]
        if max_records:
            table = table.head(max_records)

        row_count = len(table)
        for start in range(0, row_count, batch_size):
            window = table.iloc[start:start + batch_size]
            yield [
                self._convert_pandas_types(row)
                for row in window.to_dict('records')
            ]
    except Exception as e:
        raise RuntimeError(f"Error reading XML file {file_path}: {e}") from e
445
+
446
def _detect_format(self, file_path: str) -> str:
    """Detect the file format from the file extension, ignoring case.

    The final extension is looked up in ``SUPPORTED_FORMATS``. If that fails
    and the file ends in ``.gz``, the extension underneath it is looked up
    instead (e.g. ``data.csv.gz`` is resolved through ``.csv``).

    Args:
        file_path: Path or file name to inspect.

    Returns:
        The format name mapped to the matching extension in
        ``SUPPORTED_FORMATS``.

    Raises:
        ValueError: If neither the extension nor, for ``.gz`` files, the
            inner extension is a key of ``SUPPORTED_FORMATS``.
    """
    path = Path(file_path)
    suffix = path.suffix.lower()

    if suffix in SUPPORTED_FORMATS:
        return SUPPORTED_FORMATS[suffix]

    # Check for compressed files
    if suffix == '.gz':
        # Look at the extension underneath .gz. It is lower-cased like the
        # outer suffix so "DATA.CSV.GZ" resolves the same way as "data.csv.gz".
        stem = path.stem
        inner_ext = stem.split('.')[-1].lower() if '.' in stem else ''
        inner_suffix = f'.{inner_ext}'
        if inner_suffix in SUPPORTED_FORMATS:
            return SUPPORTED_FORMATS[inner_suffix]

    raise ValueError(f"Could not detect file format from extension: {suffix}")
462
+
463
def _convert_pandas_types(self, record: Dict[str, Any]) -> Dict[str, Any]:
    """Convert pandas-specific values in a record to plain Python values.

    Conversions applied per value:
        * scalar missing values (``None``, ``NaN``, ``NaT``, ``pd.NA``) -> ``None``
        * ``pd.Timestamp`` -> ISO 8601 string via ``isoformat()``
        * ``pd.Timedelta`` -> its ``str()`` representation
        * anything else (including lists, arrays and dicts) is passed through

    Args:
        record: Mapping of column name to cell value.

    Returns:
        A new dict with the same keys and converted values.
    """
    converted = {}
    for key, value in record.items():
        # Only scalars can be "missing". pd.isna() on a list/array returns an
        # element-wise array whose truth value is ambiguous and would raise.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            converted[key] = None
        elif isinstance(value, pd.Timestamp):
            converted[key] = value.isoformat()
        elif isinstance(value, pd.Timedelta):
            converted[key] = str(value)
        else:
            converted[key] = value
    return converted