pycharter 0.0.22__py3-none-any.whl → 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (404) hide show
  1. api/main.py +27 -1
  2. api/models/docs.py +68 -0
  3. api/models/evolution.py +117 -0
  4. api/models/tracking.py +111 -0
  5. api/models/validation.py +46 -6
  6. api/routes/v1/__init__.py +14 -1
  7. api/routes/v1/docs.py +187 -0
  8. api/routes/v1/evolution.py +337 -0
  9. api/routes/v1/templates.py +211 -27
  10. api/routes/v1/tracking.py +301 -0
  11. api/routes/v1/validation.py +68 -31
  12. pycharter/__init__.py +268 -58
  13. pycharter/data/templates/contract/template_coercion_rules.yaml +57 -0
  14. pycharter/data/templates/contract/template_contract.yaml +122 -0
  15. pycharter/data/templates/contract/template_metadata.yaml +68 -0
  16. pycharter/data/templates/contract/template_schema.yaml +100 -0
  17. pycharter/data/templates/contract/template_validation_rules.yaml +75 -0
  18. pycharter/data/templates/etl/README.md +224 -0
  19. pycharter/data/templates/etl/extract_cloud_azure.yaml +24 -0
  20. pycharter/data/templates/etl/extract_cloud_gcs.yaml +25 -0
  21. pycharter/data/templates/etl/extract_cloud_s3.yaml +30 -0
  22. pycharter/data/templates/etl/extract_database.yaml +34 -0
  23. pycharter/data/templates/etl/extract_database_ssh.yaml +40 -0
  24. pycharter/data/templates/etl/extract_file_csv.yaml +21 -0
  25. pycharter/data/templates/etl/extract_file_glob.yaml +25 -0
  26. pycharter/data/templates/etl/extract_file_json.yaml +24 -0
  27. pycharter/data/templates/etl/extract_file_parquet.yaml +20 -0
  28. pycharter/data/templates/etl/extract_http_paginated.yaml +79 -0
  29. pycharter/data/templates/etl/extract_http_path_params.yaml +38 -0
  30. pycharter/data/templates/etl/extract_http_simple.yaml +62 -0
  31. pycharter/data/templates/etl/load_cloud_azure.yaml +24 -0
  32. pycharter/data/templates/etl/load_cloud_gcs.yaml +22 -0
  33. pycharter/data/templates/etl/load_cloud_s3.yaml +27 -0
  34. pycharter/data/templates/etl/load_file.yaml +34 -0
  35. pycharter/data/templates/etl/load_insert.yaml +18 -0
  36. pycharter/data/templates/etl/load_postgresql.yaml +39 -0
  37. pycharter/data/templates/etl/load_sqlite.yaml +21 -0
  38. pycharter/data/templates/etl/load_truncate_and_load.yaml +20 -0
  39. pycharter/data/templates/etl/load_upsert.yaml +25 -0
  40. pycharter/data/templates/etl/load_with_dlq.yaml +34 -0
  41. pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +35 -0
  42. pycharter/data/templates/etl/pipeline_http_to_db.yaml +75 -0
  43. pycharter/data/templates/etl/transform_combined.yaml +48 -0
  44. pycharter/data/templates/etl/transform_custom_function.yaml +58 -0
  45. pycharter/data/templates/etl/transform_jsonata.yaml +51 -0
  46. pycharter/data/templates/etl/transform_simple.yaml +59 -0
  47. pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
  48. pycharter/docs_generator/__init__.py +43 -0
  49. pycharter/docs_generator/generator.py +465 -0
  50. pycharter/docs_generator/renderers.py +247 -0
  51. pycharter/etl_generator/__init__.py +168 -80
  52. pycharter/etl_generator/builder.py +121 -0
  53. pycharter/etl_generator/config_loader.py +394 -0
  54. pycharter/etl_generator/config_validator.py +418 -0
  55. pycharter/etl_generator/context.py +132 -0
  56. pycharter/etl_generator/expression.py +499 -0
  57. pycharter/etl_generator/extractors/__init__.py +30 -0
  58. pycharter/etl_generator/extractors/base.py +70 -0
  59. pycharter/etl_generator/extractors/cloud_storage.py +530 -0
  60. pycharter/etl_generator/extractors/database.py +221 -0
  61. pycharter/etl_generator/extractors/factory.py +185 -0
  62. pycharter/etl_generator/extractors/file.py +475 -0
  63. pycharter/etl_generator/extractors/http.py +895 -0
  64. pycharter/etl_generator/extractors/streaming.py +57 -0
  65. pycharter/etl_generator/loaders/__init__.py +41 -0
  66. pycharter/etl_generator/loaders/base.py +35 -0
  67. pycharter/etl_generator/loaders/cloud.py +87 -0
  68. pycharter/etl_generator/loaders/cloud_storage_loader.py +275 -0
  69. pycharter/etl_generator/loaders/database.py +274 -0
  70. pycharter/etl_generator/loaders/factory.py +180 -0
  71. pycharter/etl_generator/loaders/file.py +72 -0
  72. pycharter/etl_generator/loaders/file_loader.py +130 -0
  73. pycharter/etl_generator/pipeline.py +743 -0
  74. pycharter/etl_generator/protocols.py +54 -0
  75. pycharter/etl_generator/result.py +63 -0
  76. pycharter/etl_generator/schemas/__init__.py +49 -0
  77. pycharter/etl_generator/transformers/__init__.py +49 -0
  78. pycharter/etl_generator/transformers/base.py +63 -0
  79. pycharter/etl_generator/transformers/config.py +45 -0
  80. pycharter/etl_generator/transformers/custom_function.py +101 -0
  81. pycharter/etl_generator/transformers/jsonata_transformer.py +56 -0
  82. pycharter/etl_generator/transformers/operations.py +218 -0
  83. pycharter/etl_generator/transformers/pipeline.py +54 -0
  84. pycharter/etl_generator/transformers/simple_operations.py +131 -0
  85. pycharter/quality/__init__.py +25 -0
  86. pycharter/quality/tracking/__init__.py +64 -0
  87. pycharter/quality/tracking/collector.py +318 -0
  88. pycharter/quality/tracking/exporters.py +238 -0
  89. pycharter/quality/tracking/models.py +194 -0
  90. pycharter/quality/tracking/store.py +385 -0
  91. pycharter/runtime_validator/__init__.py +20 -7
  92. pycharter/runtime_validator/builder.py +328 -0
  93. pycharter/runtime_validator/validator.py +311 -7
  94. pycharter/runtime_validator/validator_core.py +61 -0
  95. pycharter/schema_evolution/__init__.py +61 -0
  96. pycharter/schema_evolution/compatibility.py +270 -0
  97. pycharter/schema_evolution/diff.py +496 -0
  98. pycharter/schema_evolution/models.py +201 -0
  99. pycharter/shared/__init__.py +56 -0
  100. pycharter/shared/errors.py +296 -0
  101. pycharter/shared/protocols.py +234 -0
  102. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/METADATA +146 -26
  103. pycharter-0.0.24.dist-info/RECORD +543 -0
  104. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/WHEEL +1 -1
  105. ui/static/404/index.html +1 -1
  106. ui/static/404.html +1 -1
  107. ui/static/__next.__PAGE__.txt +1 -1
  108. ui/static/__next._full.txt +1 -1
  109. ui/static/__next._head.txt +1 -1
  110. ui/static/__next._index.txt +1 -1
  111. ui/static/__next._tree.txt +1 -1
  112. ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
  113. ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
  114. ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
  115. ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
  116. ui/static/_not-found/__next._full.txt +1 -1
  117. ui/static/_not-found/__next._head.txt +1 -1
  118. ui/static/_not-found/__next._index.txt +1 -1
  119. ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  120. ui/static/_not-found/__next._not-found.txt +1 -1
  121. ui/static/_not-found/__next._tree.txt +1 -1
  122. ui/static/_not-found/index.html +1 -1
  123. ui/static/_not-found/index.txt +1 -1
  124. ui/static/contracts/__next._full.txt +2 -2
  125. ui/static/contracts/__next._head.txt +1 -1
  126. ui/static/contracts/__next._index.txt +1 -1
  127. ui/static/contracts/__next._tree.txt +1 -1
  128. ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  129. ui/static/contracts/__next.contracts.txt +1 -1
  130. ui/static/contracts/index.html +1 -1
  131. ui/static/contracts/index.txt +2 -2
  132. ui/static/documentation/__next._full.txt +1 -1
  133. ui/static/documentation/__next._head.txt +1 -1
  134. ui/static/documentation/__next._index.txt +1 -1
  135. ui/static/documentation/__next._tree.txt +1 -1
  136. ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
  137. ui/static/documentation/__next.documentation.txt +1 -1
  138. ui/static/documentation/index.html +2 -2
  139. ui/static/documentation/index.txt +1 -1
  140. ui/static/index.html +1 -1
  141. ui/static/index.txt +1 -1
  142. ui/static/metadata/__next._full.txt +1 -1
  143. ui/static/metadata/__next._head.txt +1 -1
  144. ui/static/metadata/__next._index.txt +1 -1
  145. ui/static/metadata/__next._tree.txt +1 -1
  146. ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  147. ui/static/metadata/__next.metadata.txt +1 -1
  148. ui/static/metadata/index.html +1 -1
  149. ui/static/metadata/index.txt +1 -1
  150. ui/static/quality/__next._full.txt +2 -2
  151. ui/static/quality/__next._head.txt +1 -1
  152. ui/static/quality/__next._index.txt +1 -1
  153. ui/static/quality/__next._tree.txt +1 -1
  154. ui/static/quality/__next.quality.__PAGE__.txt +2 -2
  155. ui/static/quality/__next.quality.txt +1 -1
  156. ui/static/quality/index.html +2 -2
  157. ui/static/quality/index.txt +2 -2
  158. ui/static/rules/__next._full.txt +1 -1
  159. ui/static/rules/__next._head.txt +1 -1
  160. ui/static/rules/__next._index.txt +1 -1
  161. ui/static/rules/__next._tree.txt +1 -1
  162. ui/static/rules/__next.rules.__PAGE__.txt +1 -1
  163. ui/static/rules/__next.rules.txt +1 -1
  164. ui/static/rules/index.html +1 -1
  165. ui/static/rules/index.txt +1 -1
  166. ui/static/schemas/__next._full.txt +1 -1
  167. ui/static/schemas/__next._head.txt +1 -1
  168. ui/static/schemas/__next._index.txt +1 -1
  169. ui/static/schemas/__next._tree.txt +1 -1
  170. ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  171. ui/static/schemas/__next.schemas.txt +1 -1
  172. ui/static/schemas/index.html +1 -1
  173. ui/static/schemas/index.txt +1 -1
  174. ui/static/settings/__next._full.txt +1 -1
  175. ui/static/settings/__next._head.txt +1 -1
  176. ui/static/settings/__next._index.txt +1 -1
  177. ui/static/settings/__next._tree.txt +1 -1
  178. ui/static/settings/__next.settings.__PAGE__.txt +1 -1
  179. ui/static/settings/__next.settings.txt +1 -1
  180. ui/static/settings/index.html +1 -1
  181. ui/static/settings/index.txt +1 -1
  182. ui/static/static/404/index.html +1 -1
  183. ui/static/static/404.html +1 -1
  184. ui/static/static/__next.__PAGE__.txt +1 -1
  185. ui/static/static/__next._full.txt +2 -2
  186. ui/static/static/__next._head.txt +1 -1
  187. ui/static/static/__next._index.txt +2 -2
  188. ui/static/static/__next._tree.txt +2 -2
  189. ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
  190. ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
  191. ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
  192. ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
  193. ui/static/static/_not-found/__next._full.txt +2 -2
  194. ui/static/static/_not-found/__next._head.txt +1 -1
  195. ui/static/static/_not-found/__next._index.txt +2 -2
  196. ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  197. ui/static/static/_not-found/__next._not-found.txt +1 -1
  198. ui/static/static/_not-found/__next._tree.txt +2 -2
  199. ui/static/static/_not-found/index.html +1 -1
  200. ui/static/static/_not-found/index.txt +2 -2
  201. ui/static/static/contracts/__next._full.txt +3 -3
  202. ui/static/static/contracts/__next._head.txt +1 -1
  203. ui/static/static/contracts/__next._index.txt +2 -2
  204. ui/static/static/contracts/__next._tree.txt +2 -2
  205. ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  206. ui/static/static/contracts/__next.contracts.txt +1 -1
  207. ui/static/static/contracts/index.html +1 -1
  208. ui/static/static/contracts/index.txt +3 -3
  209. ui/static/static/documentation/__next._full.txt +3 -3
  210. ui/static/static/documentation/__next._head.txt +1 -1
  211. ui/static/static/documentation/__next._index.txt +2 -2
  212. ui/static/static/documentation/__next._tree.txt +2 -2
  213. ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
  214. ui/static/static/documentation/__next.documentation.txt +1 -1
  215. ui/static/static/documentation/index.html +2 -2
  216. ui/static/static/documentation/index.txt +3 -3
  217. ui/static/static/index.html +1 -1
  218. ui/static/static/index.txt +2 -2
  219. ui/static/static/metadata/__next._full.txt +2 -2
  220. ui/static/static/metadata/__next._head.txt +1 -1
  221. ui/static/static/metadata/__next._index.txt +2 -2
  222. ui/static/static/metadata/__next._tree.txt +2 -2
  223. ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  224. ui/static/static/metadata/__next.metadata.txt +1 -1
  225. ui/static/static/metadata/index.html +1 -1
  226. ui/static/static/metadata/index.txt +2 -2
  227. ui/static/static/quality/__next._full.txt +2 -2
  228. ui/static/static/quality/__next._head.txt +1 -1
  229. ui/static/static/quality/__next._index.txt +2 -2
  230. ui/static/static/quality/__next._tree.txt +2 -2
  231. ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
  232. ui/static/static/quality/__next.quality.txt +1 -1
  233. ui/static/static/quality/index.html +2 -2
  234. ui/static/static/quality/index.txt +2 -2
  235. ui/static/static/rules/__next._full.txt +2 -2
  236. ui/static/static/rules/__next._head.txt +1 -1
  237. ui/static/static/rules/__next._index.txt +2 -2
  238. ui/static/static/rules/__next._tree.txt +2 -2
  239. ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
  240. ui/static/static/rules/__next.rules.txt +1 -1
  241. ui/static/static/rules/index.html +1 -1
  242. ui/static/static/rules/index.txt +2 -2
  243. ui/static/static/schemas/__next._full.txt +2 -2
  244. ui/static/static/schemas/__next._head.txt +1 -1
  245. ui/static/static/schemas/__next._index.txt +2 -2
  246. ui/static/static/schemas/__next._tree.txt +2 -2
  247. ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  248. ui/static/static/schemas/__next.schemas.txt +1 -1
  249. ui/static/static/schemas/index.html +1 -1
  250. ui/static/static/schemas/index.txt +2 -2
  251. ui/static/static/settings/__next._full.txt +2 -2
  252. ui/static/static/settings/__next._head.txt +1 -1
  253. ui/static/static/settings/__next._index.txt +2 -2
  254. ui/static/static/settings/__next._tree.txt +2 -2
  255. ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
  256. ui/static/static/settings/__next.settings.txt +1 -1
  257. ui/static/static/settings/index.html +1 -1
  258. ui/static/static/settings/index.txt +2 -2
  259. ui/static/static/static/.gitkeep +0 -0
  260. ui/static/static/static/404/index.html +1 -0
  261. ui/static/static/static/404.html +1 -0
  262. ui/static/static/static/__next.__PAGE__.txt +10 -0
  263. ui/static/static/static/__next._full.txt +30 -0
  264. ui/static/static/static/__next._head.txt +7 -0
  265. ui/static/static/static/__next._index.txt +9 -0
  266. ui/static/static/static/__next._tree.txt +2 -0
  267. ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
  268. ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
  269. ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
  270. ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
  271. ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
  272. ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
  273. ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
  274. ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
  275. ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
  276. ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
  277. ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
  278. ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
  279. ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
  280. ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
  281. ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
  282. ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
  283. ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
  284. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
  285. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
  286. ui/static/static/static/_not-found/__next._full.txt +17 -0
  287. ui/static/static/static/_not-found/__next._head.txt +7 -0
  288. ui/static/static/static/_not-found/__next._index.txt +9 -0
  289. ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
  290. ui/static/static/static/_not-found/__next._not-found.txt +4 -0
  291. ui/static/static/static/_not-found/__next._tree.txt +2 -0
  292. ui/static/static/static/_not-found/index.html +1 -0
  293. ui/static/static/static/_not-found/index.txt +17 -0
  294. ui/static/static/static/contracts/__next._full.txt +21 -0
  295. ui/static/static/static/contracts/__next._head.txt +7 -0
  296. ui/static/static/static/contracts/__next._index.txt +9 -0
  297. ui/static/static/static/contracts/__next._tree.txt +2 -0
  298. ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
  299. ui/static/static/static/contracts/__next.contracts.txt +4 -0
  300. ui/static/static/static/contracts/index.html +1 -0
  301. ui/static/static/static/contracts/index.txt +21 -0
  302. ui/static/static/static/documentation/__next._full.txt +21 -0
  303. ui/static/static/static/documentation/__next._head.txt +7 -0
  304. ui/static/static/static/documentation/__next._index.txt +9 -0
  305. ui/static/static/static/documentation/__next._tree.txt +2 -0
  306. ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
  307. ui/static/static/static/documentation/__next.documentation.txt +4 -0
  308. ui/static/static/static/documentation/index.html +93 -0
  309. ui/static/static/static/documentation/index.txt +21 -0
  310. ui/static/static/static/index.html +1 -0
  311. ui/static/static/static/index.txt +30 -0
  312. ui/static/static/static/metadata/__next._full.txt +21 -0
  313. ui/static/static/static/metadata/__next._head.txt +7 -0
  314. ui/static/static/static/metadata/__next._index.txt +9 -0
  315. ui/static/static/static/metadata/__next._tree.txt +2 -0
  316. ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
  317. ui/static/static/static/metadata/__next.metadata.txt +4 -0
  318. ui/static/static/static/metadata/index.html +1 -0
  319. ui/static/static/static/metadata/index.txt +21 -0
  320. ui/static/static/static/quality/__next._full.txt +21 -0
  321. ui/static/static/static/quality/__next._head.txt +7 -0
  322. ui/static/static/static/quality/__next._index.txt +9 -0
  323. ui/static/static/static/quality/__next._tree.txt +2 -0
  324. ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
  325. ui/static/static/static/quality/__next.quality.txt +4 -0
  326. ui/static/static/static/quality/index.html +2 -0
  327. ui/static/static/static/quality/index.txt +21 -0
  328. ui/static/static/static/rules/__next._full.txt +21 -0
  329. ui/static/static/static/rules/__next._head.txt +7 -0
  330. ui/static/static/static/rules/__next._index.txt +9 -0
  331. ui/static/static/static/rules/__next._tree.txt +2 -0
  332. ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
  333. ui/static/static/static/rules/__next.rules.txt +4 -0
  334. ui/static/static/static/rules/index.html +1 -0
  335. ui/static/static/static/rules/index.txt +21 -0
  336. ui/static/static/static/schemas/__next._full.txt +21 -0
  337. ui/static/static/static/schemas/__next._head.txt +7 -0
  338. ui/static/static/static/schemas/__next._index.txt +9 -0
  339. ui/static/static/static/schemas/__next._tree.txt +2 -0
  340. ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
  341. ui/static/static/static/schemas/__next.schemas.txt +4 -0
  342. ui/static/static/static/schemas/index.html +1 -0
  343. ui/static/static/static/schemas/index.txt +21 -0
  344. ui/static/static/static/settings/__next._full.txt +21 -0
  345. ui/static/static/static/settings/__next._head.txt +7 -0
  346. ui/static/static/static/settings/__next._index.txt +9 -0
  347. ui/static/static/static/settings/__next._tree.txt +2 -0
  348. ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
  349. ui/static/static/static/settings/__next.settings.txt +4 -0
  350. ui/static/static/static/settings/index.html +1 -0
  351. ui/static/static/static/settings/index.txt +21 -0
  352. ui/static/static/static/validation/__next._full.txt +21 -0
  353. ui/static/static/static/validation/__next._head.txt +7 -0
  354. ui/static/static/static/validation/__next._index.txt +9 -0
  355. ui/static/static/static/validation/__next._tree.txt +2 -0
  356. ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
  357. ui/static/static/static/validation/__next.validation.txt +4 -0
  358. ui/static/static/static/validation/index.html +1 -0
  359. ui/static/static/static/validation/index.txt +21 -0
  360. ui/static/static/validation/__next._full.txt +2 -2
  361. ui/static/static/validation/__next._head.txt +1 -1
  362. ui/static/static/validation/__next._index.txt +2 -2
  363. ui/static/static/validation/__next._tree.txt +2 -2
  364. ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
  365. ui/static/static/validation/__next.validation.txt +1 -1
  366. ui/static/static/validation/index.html +1 -1
  367. ui/static/static/validation/index.txt +2 -2
  368. ui/static/validation/__next._full.txt +2 -2
  369. ui/static/validation/__next._head.txt +1 -1
  370. ui/static/validation/__next._index.txt +1 -1
  371. ui/static/validation/__next._tree.txt +1 -1
  372. ui/static/validation/__next.validation.__PAGE__.txt +2 -2
  373. ui/static/validation/__next.validation.txt +1 -1
  374. ui/static/validation/index.html +1 -1
  375. ui/static/validation/index.txt +2 -2
  376. pycharter/data/templates/template_coercion_rules.yaml +0 -15
  377. pycharter/data/templates/template_contract.yaml +0 -587
  378. pycharter/data/templates/template_metadata.yaml +0 -38
  379. pycharter/data/templates/template_schema.yaml +0 -22
  380. pycharter/data/templates/template_transform_advanced.yaml +0 -50
  381. pycharter/data/templates/template_transform_simple.yaml +0 -59
  382. pycharter/data/templates/template_validation_rules.yaml +0 -29
  383. pycharter/etl_generator/extraction.py +0 -916
  384. pycharter/etl_generator/factory.py +0 -174
  385. pycharter/etl_generator/orchestrator.py +0 -1650
  386. pycharter/integrations/__init__.py +0 -19
  387. pycharter/integrations/kafka.py +0 -178
  388. pycharter/integrations/streaming.py +0 -100
  389. pycharter-0.0.22.dist-info/RECORD +0 -358
  390. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/entry_points.txt +0 -0
  391. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/licenses/LICENSE +0 -0
  392. {pycharter-0.0.22.dist-info → pycharter-0.0.24.dist-info}/top_level.txt +0 -0
  393. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
  394. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
  395. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
  396. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
  397. /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
  398. /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
  399. /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
  400. /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
  401. /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
  402. /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
  403. /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
  404. /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
@@ -0,0 +1,895 @@
1
+ """
2
+ HTTP/API extractor for ETL orchestrator.
3
+
4
+ Handles HTTP-based data extraction with support for:
5
+ - GET and POST requests
6
+ - Retry logic with exponential backoff
7
+ - Rate limiting
8
+ - Pagination (page, offset, cursor, next_url, link_header)
9
+ - Response parsing (JSON, text)
10
+ - Path parameter substitution
11
+ """
12
+
13
+ import asyncio
14
+ import logging
15
+ import re
16
+ import time
17
+ from typing import Any, AsyncIterator, Dict, List, Optional
18
+
19
+ import httpx
20
+
21
+ from pycharter.etl_generator.extractors.base import BaseExtractor
22
+ from pycharter.utils.value_injector import resolve_values
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Default configuration values
27
+ DEFAULT_RATE_LIMIT_DELAY = 0.2
28
+ DEFAULT_MAX_ATTEMPTS = 3
29
+ DEFAULT_BACKOFF_FACTOR = 2.0
30
+ DEFAULT_RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
31
+ DEFAULT_TIMEOUT_CONNECT = 10.0
32
+ DEFAULT_TIMEOUT_READ = 30.0
33
+ DEFAULT_TIMEOUT_WRITE = 10.0
34
+ DEFAULT_TIMEOUT_POOL = 10.0
35
+
36
+ # Common response data keys
37
+ RESPONSE_DATA_KEYS = ['data', 'results', 'items', 'records', 'values']
38
+
39
+
40
+ class HTTPExtractor(BaseExtractor):
41
+ """
42
+ Extractor for HTTP/API data sources.
43
+
44
+ Supports two modes:
45
+ 1. Programmatic API:
46
+ >>> extractor = HTTPExtractor(url="https://api.example.com/users")
47
+ >>> async for batch in extractor.extract():
48
+ ... process(batch)
49
+
50
+ 2. Config-driven (legacy):
51
+ >>> extractor = HTTPExtractor()
52
+ >>> async for batch in extractor.extract_streaming(config, params, headers):
53
+ ... process(batch)
54
+ """
55
+
56
def __init__(
    self,
    url: Optional[str] = None,
    base_url: Optional[str] = None,
    endpoint: Optional[str] = None,
    method: str = "GET",
    headers: Optional[Dict[str, str]] = None,
    params: Optional[Dict[str, Any]] = None,
    body: Optional[Any] = None,
    response_path: Optional[str] = None,
    batch_size: int = 1000,
    pagination: Optional[Dict[str, Any]] = None,
):
    """Initialize the extractor with the settings for a single HTTP source.

    Args:
        url: Full request URL; takes precedence over base_url/endpoint.
        base_url: Base URL, combined with ``endpoint`` when ``url`` is absent.
        endpoint: API endpoint path relative to ``base_url``.
        method: HTTP method (default ``"GET"``).
        headers: Request headers.
        params: Query parameters merged with call-time params.
        body: Optional request body (used for POST-style requests).
        response_path: Dotted path to the record list inside the payload.
        batch_size: Number of records per yielded batch.
        pagination: Pagination configuration dict (``enabled``, strategy, ...).
    """
    # Falsy inputs (None / "") collapse to safe defaults so downstream
    # code can assume strings and dicts without re-checking.
    self.url = url
    self.endpoint = endpoint or ""
    self.base_url = base_url or ""
    self.method = method
    self.body = body
    self.params = params or {}
    self.headers = headers or {}
    self.response_path = response_path
    self.pagination = pagination or {}
    self.batch_size = batch_size
79
+
80
@classmethod
def from_config(cls, config: Dict[str, Any]) -> "HTTPExtractor":
    """Create extractor from configuration dict.

    The legacy ``endpoint`` key is honored only when ``api_endpoint``
    is absent; unknown keys are ignored.
    """
    kwargs: Dict[str, Any] = {
        "url": config.get("url"),
        "base_url": config.get("base_url", ""),
        "endpoint": config.get("api_endpoint", config.get("endpoint", "")),
        "method": config.get("method", "GET"),
        "headers": config.get("headers", {}),
        "params": config.get("params", {}),
        "body": config.get("body"),
        "response_path": config.get("response_path"),
        "batch_size": config.get("batch_size", 1000),
        "pagination": config.get("pagination"),
    }
    return cls(**kwargs)
95
+
96
async def extract(self, **params) -> AsyncIterator[List[Dict[str, Any]]]:
    """
    Extract data from HTTP source.

    Yields:
        Batches of records
    """
    # Translate instance attributes into the config dict understood by
    # the config-driven path; a full ``url`` wins over ``endpoint``.
    cfg = dict(
        base_url=self.base_url,
        api_endpoint=self.url or self.endpoint,
        method=self.method,
        response_path=self.response_path,
        pagination=self.pagination,
    )

    # Call-time params override those supplied at construction.
    call_params = dict(self.params, **params)

    async for records in self.extract_streaming(
        cfg,
        call_params,
        self.headers,
        batch_size=self.batch_size,
    ):
        yield records
121
+
122
def validate_config(self, extract_config: Dict[str, Any]) -> None:
    """Validate HTTP extractor configuration.

    Args:
        extract_config: Extraction configuration dict.

    Raises:
        ValueError: If ``source_type`` is present but not ``'http'``, or
            if no ``base_url`` is configured and ``api_endpoint`` is not
            a full ``http://`` / ``https://`` URL (no request URL could
            be built from such a config).
    """
    if 'source_type' in extract_config and extract_config['source_type'] != 'http':
        raise ValueError(f"HTTPExtractor requires source_type='http', got '{extract_config.get('source_type')}'")

    # A request URL can be formed either from base_url (+ endpoint) or
    # from api_endpoint alone when it is already a full URL.
    # BUG FIX: the original only entered this branch when BOTH base_url
    # and api_endpoint were falsy, so the startswith() check always ran
    # against an empty string and a relative api_endpoint without a
    # base_url slipped through validation unreported.
    if not extract_config.get('base_url'):
        api_endpoint = extract_config.get('api_endpoint') or ''
        if not api_endpoint.startswith(('http://', 'https://')):
            raise ValueError(
                "HTTP extractor requires either 'api_endpoint' (with 'base_url') "
                "or 'api_endpoint' as full URL"
            )
136
+
137
async def extract_streaming(
    self,
    extract_config: Dict[str, Any],
    params: Dict[str, Any],
    headers: Dict[str, Any],
    contract_dir: Optional[Any] = None,
    batch_size: int = 1000,
    max_records: Optional[int] = None,
    config_context: Optional[Dict[str, Any]] = None,
) -> AsyncIterator[List[Dict[str, Any]]]:
    """
    Extract data from HTTP/API source with pagination support.

    Yields batches as they are extracted, preventing memory exhaustion for large datasets.
    """
    paging = extract_config.get('pagination', {})

    if paging.get('enabled', False):
        # Paginated source: delegate to the page streamer, which handles
        # re-batching and the max_records cap itself.
        async for chunk in self._extract_with_pagination(
            extract_config, params, headers, contract_dir, batch_size, max_records, config_context
        ):
            yield chunk
        return

    # Single-shot source: one request for everything, then slice the
    # result into fixed-size batches.
    logger.info("Pagination disabled, extracting all data in single request")
    records = await self._extract_with_retry(
        extract_config, params, headers, contract_dir, config_context=config_context
    )
    if max_records:
        logger.info(f"Limiting to {max_records} records (extracted {len(records)})")
        records = records[:max_records]

    logger.info(f"Yielding {len(records)} records in batches of {batch_size}")
    for start in range(0, len(records), batch_size):
        chunk = records[start:start + batch_size]
        logger.debug(f"Yielding batch {start // batch_size + 1} with {len(chunk)} records")
        yield chunk
176
+
177
async def _extract_with_retry(
    self,
    extract_config: Dict[str, Any],
    params: Dict[str, Any],
    headers: Dict[str, Any],
    contract_dir: Optional[Any] = None,
    config_context: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Extract data from API with retry logic.

    Thin wrapper over :meth:`_extract_single_page` that drops the
    pagination token and raw response, returning only the record list.
    """
    result = await self._extract_single_page(
        extract_config,
        params,
        headers,
        contract_dir,
        return_full_response=False,
        config_context=config_context,
    )
    return result[0]
190
+
191
    async def _extract_single_page(
        self,
        extract_config: Dict[str, Any],
        params: Dict[str, Any],
        headers: Dict[str, Any],
        contract_dir: Optional[Any] = None,
        return_full_response: bool = False,
        config_context: Optional[Dict[str, Any]] = None,
    ) -> tuple[List[Dict[str, Any]], Optional[Any], Optional[httpx.Response]]:
        """Issue one HTTP request (with retry/backoff) and extract its records.

        Reads connection, retry, timeout and parsing settings from
        ``extract_config``, resolves templated values in params/headers/body via
        ``resolve_values``, substitutes ``{name}`` path parameters into the
        endpoint, then performs the request up to ``max_attempts`` times with
        exponential backoff (``backoff_factor ** attempt``).

        Args:
            extract_config: Extraction settings (base_url, api_endpoint, method,
                timeout, retry, response_path, response_format, rate_limit_delay,
                body).
            params: Query parameters; entries matching path placeholders are
                popped and used for URL substitution instead.
            headers: Request headers (templated values allowed).
            contract_dir: Optional contract directory; used to derive the
                ``extract.yaml`` source path for variable resolution.
            return_full_response: When True, also return the parsed response
                body and the ``httpx.Response`` (needed by cursor/next_url/
                link_header pagination).
            config_context: Optional context dict for variable resolution.

        Returns:
            ``(records, body, response)`` — ``body`` and ``response`` are None
            unless ``return_full_response`` is True.

        Raises:
            RuntimeError: On HTTP errors, timeouts, request errors (after
                retries are exhausted) or any unexpected failure; the original
                exception is chained as ``__cause__``.
        """
        # Get configuration
        base_url = extract_config.get('base_url', '')
        api_endpoint = extract_config.get('api_endpoint', '')
        method = extract_config.get('method', 'GET').upper()
        timeout_config = extract_config.get('timeout', {})
        retry_config = extract_config.get('retry', {})
        response_path = extract_config.get('response_path')
        response_format = extract_config.get('response_format', 'json')
        rate_limit_delay = extract_config.get('rate_limit_delay', DEFAULT_RATE_LIMIT_DELAY)
        body = extract_config.get('body')

        # Resolve variables and convert types
        # NOTE: contract_dir is path-joined, so it is presumably a pathlib.Path
        # despite the Optional[Any] annotation — TODO confirm at call sites.
        source_file = str(contract_dir / "extract.yaml") if contract_dir else None
        resolved_params = resolve_values(params, context=config_context, source_file=source_file)
        resolved_headers = resolve_values(headers, context=config_context, source_file=source_file)
        resolved_timeout_config = resolve_values(timeout_config, context=config_context, source_file=source_file)
        resolved_rate_limit_delay = self._resolve_rate_limit_delay(rate_limit_delay, contract_dir, config_context)

        if body:
            resolved_body = resolve_values(body, context=config_context, source_file=source_file)
        else:
            resolved_body = None

        # Extract path parameters from api_endpoint: any query param whose name
        # matches a {placeholder} is moved out of the query string and into the URL.
        path_params = {}
        if '{' in api_endpoint:
            path_param_names = re.findall(r'\{(\w+)\}', api_endpoint)
            for param_name in path_param_names:
                if param_name in resolved_params:
                    path_params[param_name] = resolved_params.pop(param_name)

        # Build URL with path parameter substitution
        url = self._build_request_url(base_url, api_endpoint, path_params)

        # Configure timeout
        timeout = self._configure_timeout(resolved_timeout_config)

        # Configure retry
        max_attempts = int(retry_config.get('max_attempts', DEFAULT_MAX_ATTEMPTS))
        backoff_factor = float(retry_config.get('backoff_factor', DEFAULT_BACKOFF_FACTOR))
        retry_on_status = retry_config.get('retry_on_status', DEFAULT_RETRY_STATUS_CODES)

        # Make request with retry logic
        last_exception = None
        request_start_time = None

        logger.info(
            f"Starting HTTP extraction: {method} {url} "
            f"(timeout: connect={timeout.connect}s, read={timeout.read}s, "
            f"max_attempts={max_attempts})"
        )
        logger.debug(f"Request params: {resolved_params}")
        logger.debug(f"Request headers: {dict(resolved_headers)}")

        for attempt in range(max_attempts):
            try:
                request_start_time = time.time()
                logger.debug(f"HTTP request attempt {attempt + 1}/{max_attempts} to {url}")

                # A fresh client is opened per attempt so a failed connection
                # never leaks into the next try.
                async with httpx.AsyncClient(timeout=timeout) as client:
                    if attempt > 0:
                        # NOTE(review): retry paths below also sleep before
                        # `continue`, so a status-code retry appears to wait
                        # twice (here and there) — confirm this is intended.
                        wait_time = backoff_factor ** (attempt - 1)
                        logger.info(f"Retrying after {wait_time:.2f}s (attempt {attempt + 1}/{max_attempts})")
                        await asyncio.sleep(wait_time)

                    request_attempt_start = time.time()
                    try:
                        response = await self._make_http_request(
                            client, method, url, resolved_params, resolved_headers, resolved_body
                        )
                        request_duration = time.time() - request_attempt_start
                        logger.info(
                            f"HTTP request completed: {response.status_code} "
                            f"({request_duration:.2f}s, attempt {attempt + 1}/{max_attempts})"
                        )
                    except httpx.TimeoutException as timeout_error:
                        # Log with timeout details, then let the outer handler
                        # decide whether to retry.
                        request_duration = time.time() - request_attempt_start
                        timeout_info = ""
                        if hasattr(timeout_error, 'timeout') and isinstance(timeout_error.timeout, httpx.Timeout):
                            timeout_info = (
                                f" (connect={timeout_error.timeout.connect}s, "
                                f"read={timeout_error.timeout.read}s)"
                            )
                        logger.error(
                            f"HTTP request timeout after {request_duration:.2f}s{timeout_info}: "
                            f"{type(timeout_error).__name__}: {timeout_error} "
                            f"(attempt {attempt + 1}/{max_attempts})"
                        )
                        raise
                    except httpx.RequestError as request_error:
                        request_duration = time.time() - request_attempt_start
                        logger.error(
                            f"HTTP request error after {request_duration:.2f}s: "
                            f"{type(request_error).__name__}: {request_error} "
                            f"(attempt {attempt + 1}/{max_attempts})"
                        )
                        raise

                    # Check if we should retry based on status code
                    if response.status_code in retry_on_status and attempt < max_attempts - 1:
                        wait_time = backoff_factor ** attempt
                        logger.warning(
                            f"HTTP {response.status_code} received, will retry after {wait_time:.2f}s "
                            f"(attempt {attempt + 1}/{max_attempts})"
                        )
                        await asyncio.sleep(wait_time)
                        continue

                    # Raise for non-2xx status codes
                    response.raise_for_status()

                    # Parse response: JSON by default, raw text otherwise.
                    parse_start = time.time()
                    if response_format == 'json':
                        data = response.json()
                    else:
                        data = response.text
                    parse_duration = time.time() - parse_start
                    logger.debug(f"Response parsed in {parse_duration:.3f}s")

                    # Extract data array — explicit dotted path wins over the
                    # heuristic common-key lookup.
                    extract_start = time.time()
                    if response_path:
                        extracted_data = self._extract_by_path(data, response_path)
                    else:
                        extracted_data = self._extract_data_array(data)
                    extract_duration = time.time() - extract_start

                    total_duration = time.time() - request_start_time
                    logger.info(
                        f"Extraction successful: {len(extracted_data)} records extracted "
                        f"(total: {total_duration:.2f}s, parse: {parse_duration:.3f}s, "
                        f"extract: {extract_duration:.3f}s)"
                    )

                    # Apply rate limiting delay
                    if resolved_rate_limit_delay > 0:
                        logger.debug(f"Applying rate limit delay: {resolved_rate_limit_delay}s")
                        await asyncio.sleep(resolved_rate_limit_delay)

                    if return_full_response:
                        return extracted_data, data, response
                    return extracted_data, None, None

            except httpx.HTTPStatusError as e:
                last_exception = e
                request_duration = time.time() - request_start_time if request_start_time else 0

                logger.error(
                    f"HTTP status error {e.response.status_code}",
                    extra={
                        'status_code': e.response.status_code,
                        'url': url,
                        'attempt': attempt + 1,
                        'duration': request_duration,
                    },
                    exc_info=True
                )

                # Retry only configured status codes; everything else fails fast.
                if e.response.status_code in retry_on_status and attempt < max_attempts - 1:
                    wait_time = backoff_factor ** attempt
                    await asyncio.sleep(wait_time)
                    continue
                raise RuntimeError(
                    f"HTTP error {e.response.status_code}: {e.response.text}"
                ) from e
            except httpx.TimeoutException as e:
                last_exception = e
                request_duration = time.time() - request_start_time if request_start_time else 0

                logger.error(
                    "HTTP timeout",
                    extra={
                        'url': url,
                        'duration': request_duration,
                        'attempt': attempt + 1,
                    },
                    exc_info=True
                )

                # Timeouts are always retryable until attempts run out.
                if attempt < max_attempts - 1:
                    wait_time = backoff_factor ** attempt
                    await asyncio.sleep(wait_time)
                    continue
                raise RuntimeError(f"Request timeout after {request_duration:.2f}s: {e}") from e
            except httpx.RequestError as e:
                last_exception = e
                request_duration = time.time() - request_start_time if request_start_time else 0

                logger.error(
                    "HTTP request error",
                    extra={
                        'url': url,
                        'duration': request_duration,
                        'attempt': attempt + 1,
                    },
                    exc_info=True
                )

                # Transport-level errors (DNS, connection reset, ...) are retried.
                if attempt < max_attempts - 1:
                    wait_time = backoff_factor ** attempt
                    await asyncio.sleep(wait_time)
                    continue
                raise RuntimeError(f"Request failed: {e}") from e
            except Exception as e:
                # Unexpected errors are never retried — fail immediately.
                request_duration = time.time() - request_start_time if request_start_time else 0

                logger.error(
                    "Unexpected extraction error",
                    extra={
                        'url': url,
                        'duration': request_duration,
                        'attempt': attempt + 1,
                    },
                    exc_info=True
                )
                raise RuntimeError(f"Extraction failed: {e}") from e

        # If we exhausted all retries
        if last_exception:
            raise RuntimeError(
                f"Extraction failed after {max_attempts} attempts: {last_exception}"
            ) from last_exception
        raise RuntimeError("Extraction failed: unknown error")
425
+
426
    async def _extract_with_pagination(
        self,
        extract_config: Dict[str, Any],
        params: Dict[str, Any],
        headers: Dict[str, Any],
        contract_dir: Optional[Any] = None,
        batch_size: int = 1000,
        max_records: Optional[int] = None,
        config_context: Optional[Dict[str, Any]] = None,
    ) -> AsyncIterator[List[Dict[str, Any]]]:
        """Stream records page-by-page, yielding them in fixed-size batches.

        Supports five strategies configured under ``extract_config['pagination']``:
        ``page`` (incrementing page number param), ``offset`` (incrementing
        offset param), ``cursor`` (token read from the response body),
        ``next_url`` (full URL read from the response body) and ``link_header``
        (RFC 5988 ``Link: rel="next"`` header).

        Args:
            extract_config: Extraction settings including the ``pagination`` block.
            params: Query parameters; mutated in place with the page/offset/cursor
                parameter on each iteration.
            headers: Request headers, passed through to each page request.
            contract_dir: Optional contract directory for variable resolution.
            batch_size: Number of records per yielded batch.
            max_records: Hard cap on total records; when None, falls back to a
                ``max_records`` stop condition from the config, if any.
            config_context: Optional context dict for variable resolution.

        Yields:
            Lists of record dicts, each at most ``batch_size`` long.

        Raises:
            ValueError: For an unknown pagination strategy.
            Exception: Page-fetch errors are re-raised after flushing the
                partial batch.
        """
        pagination_config = extract_config.get('pagination', {})
        strategy = pagination_config.get('strategy', 'page')
        stop_conditions = pagination_config.get('stop_conditions', [])
        page_delay = float(pagination_config.get('page_delay', 0.1))
        max_pages = 1000  # safety cap so a broken API cannot loop forever
        max_records_from_config = None

        # Get max_pages and max_records from stop conditions
        for condition in stop_conditions:
            if condition.get('type') == 'max_pages':
                max_pages = condition.get('value', 1000)
            elif condition.get('type') == 'max_records':
                max_records_from_config = condition.get('value')

        # The explicit argument wins over the config value.
        if max_records is None:
            max_records = max_records_from_config

        current_batch = []
        total_extracted = 0
        page_count = 0
        current_url = None
        current_cursor = None

        # Initialize pagination state for the chosen strategy only.
        if strategy == 'page':
            page_config = pagination_config.get('page', {})
            current_page = page_config.get('start', 0)
            page_increment = page_config.get('increment', 1)
            page_param_name = page_config.get('param_name', 'page')
        elif strategy == 'offset':
            offset_config = pagination_config.get('offset', {})
            current_offset = offset_config.get('start', 0)
            offset_param_name = offset_config.get('param_name', 'offset')
            increment_by = offset_config.get('increment_by', 'limit')
        elif strategy == 'cursor':
            cursor_config = pagination_config.get('cursor', {})
            cursor_param_name = cursor_config.get('param_name', 'cursor')
            cursor_response_path = cursor_config.get('response_path', 'next_cursor')
        elif strategy == 'next_url':
            next_url_config = pagination_config.get('next_url', {})
            next_url_response_path = next_url_config.get('response_path', 'next_url')
        elif strategy == 'link_header':
            pass  # no state needed; the next URL comes from the response header
        else:
            raise ValueError(f"Unsupported pagination strategy: {strategy}")

        # Work on a shallow copy: next_url/link_header strategies temporarily
        # overwrite api_endpoint/base_url and we must not corrupt the caller's config.
        extract_config_copy = extract_config.copy()
        original_endpoint = extract_config_copy.get('api_endpoint')
        original_base_url = extract_config_copy.get('base_url', '')

        logger.info(
            f"Starting paginated extraction (strategy: {strategy}, "
            f"max_pages: {max_pages}, batch_size: {batch_size}, "
            f"page_delay: {page_delay}s)"
        )

        while page_count < max_pages:
            # Check max_records limit
            if max_records and total_extracted >= max_records:
                logger.info(
                    f"Reached max_records limit ({max_records}), stopping pagination "
                    f"(extracted {total_extracted} records from {page_count} pages)"
                )
                if current_batch:
                    yield current_batch
                return

            # Update params/URL based on strategy
            if strategy == 'page':
                params[page_param_name] = current_page
                logger.debug(f"Fetching page {current_page} (page_count: {page_count + 1}/{max_pages})")
            elif strategy == 'offset':
                params[offset_param_name] = current_offset
            elif strategy == 'cursor' and current_cursor:
                params[cursor_param_name] = current_cursor
            elif strategy == 'next_url' and current_url:
                # Absolute URL from the previous response replaces base+endpoint.
                extract_config_copy['api_endpoint'] = current_url
                extract_config_copy['base_url'] = ''

            # Make request — the full body/response is only needed by the
            # strategies that read their continuation token from it.
            need_full_response = strategy in ['cursor', 'next_url', 'link_header']
            try:
                logger.debug(f"Extracting page {page_count + 1} (total extracted so far: {total_extracted})")
                page_data, full_response_data, response_obj = await self._extract_single_page(
                    extract_config_copy, params, headers, contract_dir, return_full_response=need_full_response, config_context=config_context
                )
                logger.info(f"Page {page_count + 1} extracted: {len(page_data)} records")
            except Exception as e:
                logger.error(
                    f"Error extracting page {page_count + 1}",
                    extra={
                        'page': page_count + 1,
                        'extracted': total_extracted,
                    },
                    exc_info=True
                )
                # Flush what we already have before propagating the failure.
                if current_batch:
                    yield current_batch
                raise

            # Restore original endpoint if modified
            if strategy == 'next_url' and current_url:
                extract_config_copy['api_endpoint'] = original_endpoint
                extract_config_copy['base_url'] = original_base_url

            # Check for empty page first
            if not page_data:
                logger.info(f"Empty page {page_count + 1} received, stopping pagination")
                if current_batch:
                    yield current_batch
                break

            # Check stop conditions
            page_count += 1
            limit_value = params.get('limit', 100)
            record_count = len(page_data)
            logger.info(
                f"Evaluating stop conditions for page {page_count}: "
                f"{record_count} records returned, limit={limit_value}"
            )
            should_stop = self._check_stop_conditions(page_data, stop_conditions, params, full_response_data)
            if should_stop:
                logger.info(
                    f"✅ Stop condition met at page {page_count} "
                    f"(page returned {record_count} records, limit: {limit_value})"
                )
                # The final page's records are still emitted before stopping.
                # NOTE(review): this path does not re-check max_records mid-page
                # the way the normal path below does — confirm intended.
                for record in page_data:
                    current_batch.append(record)
                    total_extracted += 1
                    if len(current_batch) >= batch_size:
                        yield current_batch
                        current_batch = []
                if current_batch:
                    yield current_batch
                break

            # Add page data to current batch
            for record in page_data:
                current_batch.append(record)
                total_extracted += 1

                if len(current_batch) >= batch_size:
                    yield current_batch
                    current_batch = []

                if max_records and total_extracted >= max_records:
                    if current_batch:
                        yield current_batch
                    return

            # Extract pagination token/URL for next iteration
            if strategy == 'cursor' and full_response_data:
                try:
                    # Walk the dotted response path to the next-cursor value.
                    current = full_response_data
                    for part in cursor_response_path.split('.'):
                        if isinstance(current, dict):
                            current = current.get(part)
                        elif isinstance(current, list) and part.isdigit():
                            current = current[int(part)]
                        else:
                            current = None
                            break

                    if current and isinstance(current, str):
                        current_cursor = current
                    elif current:
                        # Non-string cursors (e.g. ints) are stringified.
                        current_cursor = str(current)
                    else:
                        # No cursor in the response — pagination is complete.
                        if current_batch:
                            yield current_batch
                        break
                except (KeyError, IndexError, TypeError, ValueError):
                    if current_batch:
                        yield current_batch
                    break

            elif strategy == 'next_url' and full_response_data:
                try:
                    current = full_response_data
                    for part in next_url_response_path.split('.'):
                        if isinstance(current, dict):
                            current = current.get(part)
                        elif isinstance(current, list) and part.isdigit():
                            current = current[int(part)]
                        else:
                            current = None
                            break

                    # Unlike cursors, only string URLs are accepted.
                    if current and isinstance(current, str):
                        current_url = current
                    else:
                        current_url = None

                    if not current_url:
                        if current_batch:
                            yield current_batch
                        break
                except (KeyError, IndexError, TypeError, ValueError):
                    if current_batch:
                        yield current_batch
                    break

            elif strategy == 'link_header' and response_obj:
                current_url = self._extract_link_header_url(response_obj)
                if not current_url:
                    if current_batch:
                        yield current_batch
                    break
                extract_config_copy['api_endpoint'] = current_url
                extract_config_copy['base_url'] = ''

            # Update pagination state
            if strategy == 'page':
                current_page += page_increment
            elif strategy == 'offset':
                limit = params.get('limit', 100)
                if increment_by == 'limit':
                    current_offset += limit
                else:
                    current_offset += int(increment_by)

            # Delay between pages
            if page_delay > 0:
                await asyncio.sleep(page_delay)

        # Yield remaining records
        if current_batch:
            yield current_batch
665
+
666
+ # Helper methods
667
+ def _resolve_rate_limit_delay(
668
+ self,
669
+ rate_limit_delay: Any,
670
+ contract_dir: Optional[Any] = None,
671
+ config_context: Optional[Dict[str, Any]] = None,
672
+ ) -> float:
673
+ """Resolve and convert rate_limit_delay to float."""
674
+ if isinstance(rate_limit_delay, str):
675
+ source_file = str(contract_dir / "extract.yaml") if contract_dir else None
676
+ resolved = resolve_values(rate_limit_delay, context=config_context, source_file=source_file)
677
+ return float(resolved)
678
+ return float(rate_limit_delay)
679
+
680
+ def _build_request_url(
681
+ self,
682
+ base_url: str,
683
+ api_endpoint: str,
684
+ path_params: Optional[Dict[str, Any]] = None,
685
+ ) -> str:
686
+ """Build full request URL from base URL and endpoint."""
687
+ if api_endpoint.startswith(('http://', 'https://')):
688
+ url = api_endpoint
689
+ elif base_url:
690
+ base_url = base_url.rstrip('/')
691
+ endpoint = api_endpoint.lstrip('/')
692
+ url = f"{base_url}/{endpoint}"
693
+ else:
694
+ raise ValueError(
695
+ "Either 'api_endpoint' must be a full URL (starting with http:// or https://) "
696
+ "or 'base_url' must be provided in extract.yaml"
697
+ )
698
+
699
+ # Substitute path parameters
700
+ if path_params and '{' in url:
701
+ try:
702
+ url = url.format(**path_params)
703
+ except KeyError as e:
704
+ raise ValueError(
705
+ f"Missing required path parameter in URL: {e}. "
706
+ f"URL: {url}, Available params: {list(path_params.keys())}"
707
+ ) from e
708
+
709
+ return url
710
+
711
+ def _configure_timeout(self, timeout_config: Dict[str, Any]) -> httpx.Timeout:
712
+ """Configure HTTP timeout from config dictionary."""
713
+ timeout = httpx.Timeout(
714
+ connect=float(timeout_config.get('connect', DEFAULT_TIMEOUT_CONNECT)),
715
+ read=float(timeout_config.get('read', DEFAULT_TIMEOUT_READ)),
716
+ write=float(timeout_config.get('write', DEFAULT_TIMEOUT_WRITE)),
717
+ pool=float(timeout_config.get('pool', DEFAULT_TIMEOUT_POOL)),
718
+ )
719
+ logger.debug(
720
+ f"Configured HTTP timeout: connect={timeout.connect}s, "
721
+ f"read={timeout.read}s, write={timeout.write}s, pool={timeout.pool}s"
722
+ )
723
+ return timeout
724
+
725
+ async def _make_http_request(
726
+ self,
727
+ client: httpx.AsyncClient,
728
+ method: str,
729
+ url: str,
730
+ params: Dict[str, Any],
731
+ headers: Dict[str, Any],
732
+ body: Optional[Any] = None,
733
+ ) -> httpx.Response:
734
+ """Make HTTP request with specified method."""
735
+ method = method.upper()
736
+
737
+ logger.debug(f"Making {method} request to {url}")
738
+
739
+ try:
740
+ if method == 'GET':
741
+ return await client.get(url, params=params, headers=headers)
742
+ elif method == 'POST':
743
+ if body:
744
+ return await client.post(
745
+ url,
746
+ json=body if isinstance(body, dict) else body,
747
+ params=params,
748
+ headers=headers,
749
+ )
750
+ else:
751
+ return await client.post(url, params=params, headers=headers)
752
+ else:
753
+ raise ValueError(f"Unsupported HTTP method: {method}")
754
+ except httpx.TimeoutException as e:
755
+ timeout_info = ""
756
+ if hasattr(e, 'timeout') and isinstance(e.timeout, httpx.Timeout):
757
+ timeout_info = (
758
+ f" (connect timeout: {e.timeout.connect}s, "
759
+ f"read timeout: {e.timeout.read}s)"
760
+ )
761
+ logger.error(f"HTTP request timeout for {method} {url}{timeout_info}")
762
+ raise
763
+ except httpx.RequestError as e:
764
+ logger.error(f"HTTP request error for {method} {url}: {type(e).__name__}: {e}")
765
+ raise
766
+
767
+ def _extract_by_path(self, data: Any, path: str) -> List[Dict[str, Any]]:
768
+ """Extract data using a simple path notation (e.g., 'data.items')."""
769
+ current = data
770
+ for part in path.split('.'):
771
+ if isinstance(current, dict):
772
+ current = current.get(part)
773
+ elif isinstance(current, list) and part.isdigit():
774
+ current = current[int(part)]
775
+ else:
776
+ return []
777
+
778
+ if current is None:
779
+ return []
780
+
781
+ if isinstance(current, list):
782
+ return current
783
+ elif isinstance(current, dict):
784
+ return [current]
785
+ else:
786
+ return []
787
+
788
+ def _extract_data_array(self, data: Any) -> List[Dict[str, Any]]:
789
+ """Extract data array from response, handling common response structures."""
790
+ if isinstance(data, list):
791
+ return data
792
+ elif isinstance(data, dict):
793
+ # Try common keys for data arrays
794
+ for key in RESPONSE_DATA_KEYS:
795
+ if key in data and isinstance(data[key], list):
796
+ return data[key]
797
+ # If no array found, return as single-item list
798
+ return [data]
799
+ else:
800
+ return []
801
+
802
+ def _check_stop_conditions(
803
+ self,
804
+ page_data: List[Dict[str, Any]],
805
+ stop_conditions: List[Dict[str, Any]],
806
+ params: Dict[str, Any],
807
+ response_data: Any = None,
808
+ ) -> bool:
809
+ """Check if pagination should stop based on configured stop conditions."""
810
+ if not stop_conditions:
811
+ # Default: stop if fewer records than limit
812
+ limit = params.get('limit', 100)
813
+ return len(page_data) < limit
814
+
815
+ for condition in stop_conditions:
816
+ if self._check_stop_condition(condition, page_data, params, response_data):
817
+ return True
818
+
819
+ return False
820
+
821
+ def _check_stop_condition(
822
+ self,
823
+ condition: Dict[str, Any],
824
+ page_data: List[Dict[str, Any]],
825
+ params: Dict[str, Any],
826
+ response_data: Any = None,
827
+ ) -> bool:
828
+ """Check a single stop condition."""
829
+ condition_type = condition.get('type')
830
+
831
+ if condition_type == 'empty_response':
832
+ if not page_data:
833
+ logger.debug("Stop condition 'empty_response' triggered: page is empty")
834
+ return True
835
+
836
+ elif condition_type == 'fewer_records':
837
+ limit = params.get('limit', 100)
838
+ record_count = len(page_data)
839
+ if record_count < limit:
840
+ logger.debug(
841
+ f"Stop condition 'fewer_records' triggered: "
842
+ f"page returned {record_count} records < limit {limit}"
843
+ )
844
+ return True
845
+
846
+ elif condition_type == 'max_pages':
847
+ max_pages = condition.get('value', 1000)
848
+ current_page = params.get('page', 0)
849
+ if current_page >= max_pages:
850
+ logger.debug(f"Stop condition 'max_pages' triggered: page {current_page} >= {max_pages}")
851
+ return True
852
+
853
+ elif condition_type == 'custom':
854
+ return self._check_custom_stop_condition(condition, response_data)
855
+
856
+ return False
857
+
858
+ def _check_custom_stop_condition(
859
+ self,
860
+ condition: Dict[str, Any],
861
+ response_data: Any,
862
+ ) -> bool:
863
+ """Check custom stop condition based on response path."""
864
+ response_path = condition.get('response_path')
865
+ expected_value = condition.get('value')
866
+
867
+ if not response_path or not response_data:
868
+ return False
869
+
870
+ try:
871
+ current = response_data
872
+ for part in response_path.split('.'):
873
+ if isinstance(current, dict):
874
+ current = current.get(part)
875
+ elif isinstance(current, list) and part.isdigit():
876
+ current = current[int(part)]
877
+ else:
878
+ return False
879
+ return current == expected_value
880
+ except (KeyError, IndexError, TypeError):
881
+ return False
882
+
883
+ def _extract_link_header_url(self, response: httpx.Response) -> Optional[str]:
884
+ """Extract next URL from Link header (RFC 5988)."""
885
+ link_header = response.headers.get('Link', '')
886
+ if not link_header:
887
+ return None
888
+
889
+ # Parse Link header: <url>; rel="next"
890
+ pattern = r'<([^>]+)>;\s*rel=["\']?next["\']?'
891
+ match = re.search(pattern, link_header, re.IGNORECASE)
892
+ if match:
893
+ return match.group(1)
894
+
895
+ return None