pycharter 0.0.22-py3-none-any.whl → 0.0.23-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332)
  1. api/routes/v1/templates.py +43 -24
  2. pycharter/data/templates/etl/README.md +91 -0
  3. pycharter/data/templates/etl/extract_cloud_azure.yaml +23 -0
  4. pycharter/data/templates/etl/extract_cloud_gcs.yaml +22 -0
  5. pycharter/data/templates/etl/extract_cloud_s3.yaml +24 -0
  6. pycharter/data/templates/etl/extract_database.yaml +28 -0
  7. pycharter/data/templates/etl/extract_database_ssh.yaml +27 -0
  8. pycharter/data/templates/etl/extract_file_csv.yaml +17 -0
  9. pycharter/data/templates/etl/extract_file_glob.yaml +17 -0
  10. pycharter/data/templates/etl/extract_file_json.yaml +14 -0
  11. pycharter/data/templates/etl/extract_file_parquet.yaml +13 -0
  12. pycharter/data/templates/etl/extract_http_paginated.yaml +75 -0
  13. pycharter/data/templates/etl/extract_http_path_params.yaml +45 -0
  14. pycharter/data/templates/etl/extract_http_simple.yaml +52 -0
  15. pycharter/data/templates/etl/load_insert.yaml +17 -0
  16. pycharter/data/templates/etl/load_postgresql.yaml +17 -0
  17. pycharter/data/templates/etl/load_sqlite.yaml +16 -0
  18. pycharter/data/templates/etl/load_truncate_and_load.yaml +18 -0
  19. pycharter/data/templates/etl/load_upsert.yaml +28 -0
  20. pycharter/data/templates/etl/load_with_dlq.yaml +24 -0
  21. pycharter/data/templates/etl/load_with_ssh_tunnel.yaml +28 -0
  22. pycharter/data/templates/etl/pipeline_http_to_db.yaml +38 -0
  23. pycharter/data/templates/etl/transform_combined.yaml +38 -0
  24. pycharter/data/templates/etl/transform_custom_function.yaml +18 -0
  25. pycharter/data/templates/etl/transform_jsonata.yaml +20 -0
  26. pycharter/data/templates/etl/transform_simple.yaml +41 -0
  27. pycharter/db/schemas/.ipynb_checkpoints/data_contract-checkpoint.py +160 -0
  28. pycharter/etl_generator/extraction.py +47 -262
  29. pycharter/etl_generator/extractors/__init__.py +26 -0
  30. pycharter/etl_generator/extractors/base.py +70 -0
  31. pycharter/etl_generator/extractors/cloud_storage.py +454 -0
  32. pycharter/etl_generator/extractors/database.py +151 -0
  33. pycharter/etl_generator/extractors/factory.py +141 -0
  34. pycharter/etl_generator/extractors/file.py +418 -0
  35. pycharter/etl_generator/extractors/http.py +816 -0
  36. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/METADATA +6 -1
  37. pycharter-0.0.23.dist-info/RECORD +498 -0
  38. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/WHEEL +1 -1
  39. ui/static/404/index.html +1 -1
  40. ui/static/404.html +1 -1
  41. ui/static/__next.__PAGE__.txt +1 -1
  42. ui/static/__next._full.txt +1 -1
  43. ui/static/__next._head.txt +1 -1
  44. ui/static/__next._index.txt +1 -1
  45. ui/static/__next._tree.txt +1 -1
  46. ui/static/_next/static/chunks/26dfc590f7714c03.js +1 -0
  47. ui/static/_next/static/chunks/34d289e6db2ef551.js +1 -0
  48. ui/static/_next/static/chunks/99508d9d5869cc27.js +1 -0
  49. ui/static/_next/static/chunks/b313c35a6ba76574.js +1 -0
  50. ui/static/_not-found/__next._full.txt +1 -1
  51. ui/static/_not-found/__next._head.txt +1 -1
  52. ui/static/_not-found/__next._index.txt +1 -1
  53. ui/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  54. ui/static/_not-found/__next._not-found.txt +1 -1
  55. ui/static/_not-found/__next._tree.txt +1 -1
  56. ui/static/_not-found/index.html +1 -1
  57. ui/static/_not-found/index.txt +1 -1
  58. ui/static/contracts/__next._full.txt +2 -2
  59. ui/static/contracts/__next._head.txt +1 -1
  60. ui/static/contracts/__next._index.txt +1 -1
  61. ui/static/contracts/__next._tree.txt +1 -1
  62. ui/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  63. ui/static/contracts/__next.contracts.txt +1 -1
  64. ui/static/contracts/index.html +1 -1
  65. ui/static/contracts/index.txt +2 -2
  66. ui/static/documentation/__next._full.txt +1 -1
  67. ui/static/documentation/__next._head.txt +1 -1
  68. ui/static/documentation/__next._index.txt +1 -1
  69. ui/static/documentation/__next._tree.txt +1 -1
  70. ui/static/documentation/__next.documentation.__PAGE__.txt +1 -1
  71. ui/static/documentation/__next.documentation.txt +1 -1
  72. ui/static/documentation/index.html +2 -2
  73. ui/static/documentation/index.txt +1 -1
  74. ui/static/index.html +1 -1
  75. ui/static/index.txt +1 -1
  76. ui/static/metadata/__next._full.txt +1 -1
  77. ui/static/metadata/__next._head.txt +1 -1
  78. ui/static/metadata/__next._index.txt +1 -1
  79. ui/static/metadata/__next._tree.txt +1 -1
  80. ui/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  81. ui/static/metadata/__next.metadata.txt +1 -1
  82. ui/static/metadata/index.html +1 -1
  83. ui/static/metadata/index.txt +1 -1
  84. ui/static/quality/__next._full.txt +2 -2
  85. ui/static/quality/__next._head.txt +1 -1
  86. ui/static/quality/__next._index.txt +1 -1
  87. ui/static/quality/__next._tree.txt +1 -1
  88. ui/static/quality/__next.quality.__PAGE__.txt +2 -2
  89. ui/static/quality/__next.quality.txt +1 -1
  90. ui/static/quality/index.html +2 -2
  91. ui/static/quality/index.txt +2 -2
  92. ui/static/rules/__next._full.txt +1 -1
  93. ui/static/rules/__next._head.txt +1 -1
  94. ui/static/rules/__next._index.txt +1 -1
  95. ui/static/rules/__next._tree.txt +1 -1
  96. ui/static/rules/__next.rules.__PAGE__.txt +1 -1
  97. ui/static/rules/__next.rules.txt +1 -1
  98. ui/static/rules/index.html +1 -1
  99. ui/static/rules/index.txt +1 -1
  100. ui/static/schemas/__next._full.txt +1 -1
  101. ui/static/schemas/__next._head.txt +1 -1
  102. ui/static/schemas/__next._index.txt +1 -1
  103. ui/static/schemas/__next._tree.txt +1 -1
  104. ui/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  105. ui/static/schemas/__next.schemas.txt +1 -1
  106. ui/static/schemas/index.html +1 -1
  107. ui/static/schemas/index.txt +1 -1
  108. ui/static/settings/__next._full.txt +1 -1
  109. ui/static/settings/__next._head.txt +1 -1
  110. ui/static/settings/__next._index.txt +1 -1
  111. ui/static/settings/__next._tree.txt +1 -1
  112. ui/static/settings/__next.settings.__PAGE__.txt +1 -1
  113. ui/static/settings/__next.settings.txt +1 -1
  114. ui/static/settings/index.html +1 -1
  115. ui/static/settings/index.txt +1 -1
  116. ui/static/static/404/index.html +1 -1
  117. ui/static/static/404.html +1 -1
  118. ui/static/static/__next.__PAGE__.txt +1 -1
  119. ui/static/static/__next._full.txt +2 -2
  120. ui/static/static/__next._head.txt +1 -1
  121. ui/static/static/__next._index.txt +2 -2
  122. ui/static/static/__next._tree.txt +2 -2
  123. ui/static/static/_next/static/chunks/13d4a0fbd74c1ee4.js +1 -0
  124. ui/static/static/_next/static/chunks/2edb43b48432ac04.js +441 -0
  125. ui/static/static/_next/static/chunks/d2363397e1b2bcab.css +1 -0
  126. ui/static/static/_next/static/chunks/f7d1a90dd75d2572.js +1 -0
  127. ui/static/static/_not-found/__next._full.txt +2 -2
  128. ui/static/static/_not-found/__next._head.txt +1 -1
  129. ui/static/static/_not-found/__next._index.txt +2 -2
  130. ui/static/static/_not-found/__next._not-found.__PAGE__.txt +1 -1
  131. ui/static/static/_not-found/__next._not-found.txt +1 -1
  132. ui/static/static/_not-found/__next._tree.txt +2 -2
  133. ui/static/static/_not-found/index.html +1 -1
  134. ui/static/static/_not-found/index.txt +2 -2
  135. ui/static/static/contracts/__next._full.txt +3 -3
  136. ui/static/static/contracts/__next._head.txt +1 -1
  137. ui/static/static/contracts/__next._index.txt +2 -2
  138. ui/static/static/contracts/__next._tree.txt +2 -2
  139. ui/static/static/contracts/__next.contracts.__PAGE__.txt +2 -2
  140. ui/static/static/contracts/__next.contracts.txt +1 -1
  141. ui/static/static/contracts/index.html +1 -1
  142. ui/static/static/contracts/index.txt +3 -3
  143. ui/static/static/documentation/__next._full.txt +3 -3
  144. ui/static/static/documentation/__next._head.txt +1 -1
  145. ui/static/static/documentation/__next._index.txt +2 -2
  146. ui/static/static/documentation/__next._tree.txt +2 -2
  147. ui/static/static/documentation/__next.documentation.__PAGE__.txt +2 -2
  148. ui/static/static/documentation/__next.documentation.txt +1 -1
  149. ui/static/static/documentation/index.html +2 -2
  150. ui/static/static/documentation/index.txt +3 -3
  151. ui/static/static/index.html +1 -1
  152. ui/static/static/index.txt +2 -2
  153. ui/static/static/metadata/__next._full.txt +2 -2
  154. ui/static/static/metadata/__next._head.txt +1 -1
  155. ui/static/static/metadata/__next._index.txt +2 -2
  156. ui/static/static/metadata/__next._tree.txt +2 -2
  157. ui/static/static/metadata/__next.metadata.__PAGE__.txt +1 -1
  158. ui/static/static/metadata/__next.metadata.txt +1 -1
  159. ui/static/static/metadata/index.html +1 -1
  160. ui/static/static/metadata/index.txt +2 -2
  161. ui/static/static/quality/__next._full.txt +2 -2
  162. ui/static/static/quality/__next._head.txt +1 -1
  163. ui/static/static/quality/__next._index.txt +2 -2
  164. ui/static/static/quality/__next._tree.txt +2 -2
  165. ui/static/static/quality/__next.quality.__PAGE__.txt +1 -1
  166. ui/static/static/quality/__next.quality.txt +1 -1
  167. ui/static/static/quality/index.html +2 -2
  168. ui/static/static/quality/index.txt +2 -2
  169. ui/static/static/rules/__next._full.txt +2 -2
  170. ui/static/static/rules/__next._head.txt +1 -1
  171. ui/static/static/rules/__next._index.txt +2 -2
  172. ui/static/static/rules/__next._tree.txt +2 -2
  173. ui/static/static/rules/__next.rules.__PAGE__.txt +1 -1
  174. ui/static/static/rules/__next.rules.txt +1 -1
  175. ui/static/static/rules/index.html +1 -1
  176. ui/static/static/rules/index.txt +2 -2
  177. ui/static/static/schemas/__next._full.txt +2 -2
  178. ui/static/static/schemas/__next._head.txt +1 -1
  179. ui/static/static/schemas/__next._index.txt +2 -2
  180. ui/static/static/schemas/__next._tree.txt +2 -2
  181. ui/static/static/schemas/__next.schemas.__PAGE__.txt +1 -1
  182. ui/static/static/schemas/__next.schemas.txt +1 -1
  183. ui/static/static/schemas/index.html +1 -1
  184. ui/static/static/schemas/index.txt +2 -2
  185. ui/static/static/settings/__next._full.txt +2 -2
  186. ui/static/static/settings/__next._head.txt +1 -1
  187. ui/static/static/settings/__next._index.txt +2 -2
  188. ui/static/static/settings/__next._tree.txt +2 -2
  189. ui/static/static/settings/__next.settings.__PAGE__.txt +1 -1
  190. ui/static/static/settings/__next.settings.txt +1 -1
  191. ui/static/static/settings/index.html +1 -1
  192. ui/static/static/settings/index.txt +2 -2
  193. ui/static/static/static/.gitkeep +0 -0
  194. ui/static/static/static/404/index.html +1 -0
  195. ui/static/static/static/404.html +1 -0
  196. ui/static/static/static/__next.__PAGE__.txt +10 -0
  197. ui/static/static/static/__next._full.txt +30 -0
  198. ui/static/static/static/__next._head.txt +7 -0
  199. ui/static/static/static/__next._index.txt +9 -0
  200. ui/static/static/static/__next._tree.txt +2 -0
  201. ui/static/static/static/_next/static/chunks/222442f6da32302a.js +1 -0
  202. ui/static/static/static/_next/static/chunks/247eb132b7f7b574.js +1 -0
  203. ui/static/static/static/_next/static/chunks/297d55555b71baba.js +1 -0
  204. ui/static/static/static/_next/static/chunks/2ab439ce003cd691.js +1 -0
  205. ui/static/static/static/_next/static/chunks/414e77373f8ff61c.js +1 -0
  206. ui/static/static/static/_next/static/chunks/49ca65abd26ae49e.js +1 -0
  207. ui/static/static/static/_next/static/chunks/652ad0aa26265c47.js +2 -0
  208. ui/static/static/static/_next/static/chunks/9667e7a3d359eb39.js +1 -0
  209. ui/static/static/static/_next/static/chunks/9c23f44fff36548a.js +1 -0
  210. ui/static/static/static/_next/static/chunks/a6dad97d9634a72d.js +1 -0
  211. ui/static/static/static/_next/static/chunks/b32a0963684b9933.js +4 -0
  212. ui/static/static/static/_next/static/chunks/c69f6cba366bd988.js +1 -0
  213. ui/static/static/static/_next/static/chunks/db913959c675cea6.js +1 -0
  214. ui/static/static/static/_next/static/chunks/f061a4be97bfc3b3.js +1 -0
  215. ui/static/static/static/_next/static/chunks/f2e7afeab1178138.js +1 -0
  216. ui/static/static/static/_next/static/chunks/ff1a16fafef87110.js +1 -0
  217. ui/static/static/static/_next/static/chunks/turbopack-ffcb7ab6794027ef.js +3 -0
  218. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_buildManifest.js +11 -0
  219. ui/static/static/static/_next/static/tNTkVW6puVXC4bAm4WrHl/_ssgManifest.js +1 -0
  220. ui/static/static/static/_not-found/__next._full.txt +17 -0
  221. ui/static/static/static/_not-found/__next._head.txt +7 -0
  222. ui/static/static/static/_not-found/__next._index.txt +9 -0
  223. ui/static/static/static/_not-found/__next._not-found.__PAGE__.txt +5 -0
  224. ui/static/static/static/_not-found/__next._not-found.txt +4 -0
  225. ui/static/static/static/_not-found/__next._tree.txt +2 -0
  226. ui/static/static/static/_not-found/index.html +1 -0
  227. ui/static/static/static/_not-found/index.txt +17 -0
  228. ui/static/static/static/contracts/__next._full.txt +21 -0
  229. ui/static/static/static/contracts/__next._head.txt +7 -0
  230. ui/static/static/static/contracts/__next._index.txt +9 -0
  231. ui/static/static/static/contracts/__next._tree.txt +2 -0
  232. ui/static/static/static/contracts/__next.contracts.__PAGE__.txt +9 -0
  233. ui/static/static/static/contracts/__next.contracts.txt +4 -0
  234. ui/static/static/static/contracts/index.html +1 -0
  235. ui/static/static/static/contracts/index.txt +21 -0
  236. ui/static/static/static/documentation/__next._full.txt +21 -0
  237. ui/static/static/static/documentation/__next._head.txt +7 -0
  238. ui/static/static/static/documentation/__next._index.txt +9 -0
  239. ui/static/static/static/documentation/__next._tree.txt +2 -0
  240. ui/static/static/static/documentation/__next.documentation.__PAGE__.txt +9 -0
  241. ui/static/static/static/documentation/__next.documentation.txt +4 -0
  242. ui/static/static/static/documentation/index.html +93 -0
  243. ui/static/static/static/documentation/index.txt +21 -0
  244. ui/static/static/static/index.html +1 -0
  245. ui/static/static/static/index.txt +30 -0
  246. ui/static/static/static/metadata/__next._full.txt +21 -0
  247. ui/static/static/static/metadata/__next._head.txt +7 -0
  248. ui/static/static/static/metadata/__next._index.txt +9 -0
  249. ui/static/static/static/metadata/__next._tree.txt +2 -0
  250. ui/static/static/static/metadata/__next.metadata.__PAGE__.txt +9 -0
  251. ui/static/static/static/metadata/__next.metadata.txt +4 -0
  252. ui/static/static/static/metadata/index.html +1 -0
  253. ui/static/static/static/metadata/index.txt +21 -0
  254. ui/static/static/static/quality/__next._full.txt +21 -0
  255. ui/static/static/static/quality/__next._head.txt +7 -0
  256. ui/static/static/static/quality/__next._index.txt +9 -0
  257. ui/static/static/static/quality/__next._tree.txt +2 -0
  258. ui/static/static/static/quality/__next.quality.__PAGE__.txt +9 -0
  259. ui/static/static/static/quality/__next.quality.txt +4 -0
  260. ui/static/static/static/quality/index.html +2 -0
  261. ui/static/static/static/quality/index.txt +21 -0
  262. ui/static/static/static/rules/__next._full.txt +21 -0
  263. ui/static/static/static/rules/__next._head.txt +7 -0
  264. ui/static/static/static/rules/__next._index.txt +9 -0
  265. ui/static/static/static/rules/__next._tree.txt +2 -0
  266. ui/static/static/static/rules/__next.rules.__PAGE__.txt +9 -0
  267. ui/static/static/static/rules/__next.rules.txt +4 -0
  268. ui/static/static/static/rules/index.html +1 -0
  269. ui/static/static/static/rules/index.txt +21 -0
  270. ui/static/static/static/schemas/__next._full.txt +21 -0
  271. ui/static/static/static/schemas/__next._head.txt +7 -0
  272. ui/static/static/static/schemas/__next._index.txt +9 -0
  273. ui/static/static/static/schemas/__next._tree.txt +2 -0
  274. ui/static/static/static/schemas/__next.schemas.__PAGE__.txt +9 -0
  275. ui/static/static/static/schemas/__next.schemas.txt +4 -0
  276. ui/static/static/static/schemas/index.html +1 -0
  277. ui/static/static/static/schemas/index.txt +21 -0
  278. ui/static/static/static/settings/__next._full.txt +21 -0
  279. ui/static/static/static/settings/__next._head.txt +7 -0
  280. ui/static/static/static/settings/__next._index.txt +9 -0
  281. ui/static/static/static/settings/__next._tree.txt +2 -0
  282. ui/static/static/static/settings/__next.settings.__PAGE__.txt +9 -0
  283. ui/static/static/static/settings/__next.settings.txt +4 -0
  284. ui/static/static/static/settings/index.html +1 -0
  285. ui/static/static/static/settings/index.txt +21 -0
  286. ui/static/static/static/validation/__next._full.txt +21 -0
  287. ui/static/static/static/validation/__next._head.txt +7 -0
  288. ui/static/static/static/validation/__next._index.txt +9 -0
  289. ui/static/static/static/validation/__next._tree.txt +2 -0
  290. ui/static/static/static/validation/__next.validation.__PAGE__.txt +9 -0
  291. ui/static/static/static/validation/__next.validation.txt +4 -0
  292. ui/static/static/static/validation/index.html +1 -0
  293. ui/static/static/static/validation/index.txt +21 -0
  294. ui/static/static/validation/__next._full.txt +2 -2
  295. ui/static/static/validation/__next._head.txt +1 -1
  296. ui/static/static/validation/__next._index.txt +2 -2
  297. ui/static/static/validation/__next._tree.txt +2 -2
  298. ui/static/static/validation/__next.validation.__PAGE__.txt +1 -1
  299. ui/static/static/validation/__next.validation.txt +1 -1
  300. ui/static/static/validation/index.html +1 -1
  301. ui/static/static/validation/index.txt +2 -2
  302. ui/static/validation/__next._full.txt +2 -2
  303. ui/static/validation/__next._head.txt +1 -1
  304. ui/static/validation/__next._index.txt +1 -1
  305. ui/static/validation/__next._tree.txt +1 -1
  306. ui/static/validation/__next.validation.__PAGE__.txt +2 -2
  307. ui/static/validation/__next.validation.txt +1 -1
  308. ui/static/validation/index.html +1 -1
  309. ui/static/validation/index.txt +2 -2
  310. pycharter/data/templates/template_transform_advanced.yaml +0 -50
  311. pycharter/data/templates/template_transform_simple.yaml +0 -59
  312. pycharter-0.0.22.dist-info/RECORD +0 -358
  313. /pycharter/data/templates/{template_coercion_rules.yaml → contract/template_coercion_rules.yaml} +0 -0
  314. /pycharter/data/templates/{template_contract.yaml → contract/template_contract.yaml} +0 -0
  315. /pycharter/data/templates/{template_metadata.yaml → contract/template_metadata.yaml} +0 -0
  316. /pycharter/data/templates/{template_schema.yaml → contract/template_schema.yaml} +0 -0
  317. /pycharter/data/templates/{template_validation_rules.yaml → contract/template_validation_rules.yaml} +0 -0
  318. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/entry_points.txt +0 -0
  319. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/licenses/LICENSE +0 -0
  320. {pycharter-0.0.22.dist-info → pycharter-0.0.23.dist-info}/top_level.txt +0 -0
  321. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_buildManifest.js +0 -0
  322. /ui/static/_next/static/{0rYA78L88aUyD2Uh38hhX → 2gKjNv6YvE6BcIdFthBLs}/_ssgManifest.js +0 -0
  323. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_buildManifest.js +0 -0
  324. /ui/static/static/_next/static/{tNTkVW6puVXC4bAm4WrHl → 0rYA78L88aUyD2Uh38hhX}/_ssgManifest.js +0 -0
  325. /ui/static/{_next → static/_next}/static/chunks/c4fa4f4114b7c352.js +0 -0
  326. /ui/static/static/{_next → static/_next}/static/chunks/4e310fe5005770a3.css +0 -0
  327. /ui/static/{_next → static/static/_next}/static/chunks/5e04d10c4a7b58a3.js +0 -0
  328. /ui/static/static/{_next → static/_next}/static/chunks/5fc14c00a2779dc5.js +0 -0
  329. /ui/static/{_next → static/static/_next}/static/chunks/75d88a058d8ffaa6.js +0 -0
  330. /ui/static/{_next → static/static/_next}/static/chunks/8c89634cf6bad76f.js +0 -0
  331. /ui/static/static/{_next → static/_next}/static/chunks/b584574fdc8ab13e.js +0 -0
  332. /ui/static/static/{_next → static/_next}/static/chunks/d5989c94d3614b3a.js +0 -0
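The headline change in this release is the new extractor package (pycharter/etl_generator/extractors/), whose largest module, http.py, is shown in the hunk below. The extractor is driven by a plain dict, typically loaded from an extract.yaml. The following sketch is a hypothetical config assembled from the keys the new code reads (base_url, api_endpoint, method, response_path, retry, pagination, and so on); every value in it is illustrative and it is not the contents of any shipped template:

# Hypothetical config for the new HTTPExtractor. Keys mirror the
# extract_config reads in http.py below; all values are made up.
extract_config = {
    "source_type": "http",
    "base_url": "https://api.example.com",
    "api_endpoint": "/v1/records/{region}",  # {region} is pulled from params
    "method": "GET",
    "response_format": "json",
    "response_path": "data.items",  # dot-path into the JSON body
    "retry": {
        "max_attempts": 3,
        "backoff_factor": 2.0,
        "retry_on_status": [429, 500, 502, 503, 504],
    },
    "pagination": {
        "enabled": True,
        "strategy": "offset",
        "offset": {"param_name": "offset", "start": 0, "increment_by": "limit"},
        "stop_conditions": [{"type": "fewer_records"}],
    },
}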
pycharter/etl_generator/extractors/http.py (new file)
@@ -0,0 +1,816 @@
+ """
+ HTTP/API extractor for ETL orchestrator.
+
+ Handles HTTP-based data extraction with support for:
+ - GET and POST requests
+ - Retry logic with exponential backoff
+ - Rate limiting
+ - Pagination (page, offset, cursor, next_url, link_header)
+ - Response parsing (JSON, text)
+ - Path parameter substitution
+ """
+
+ import asyncio
+ import logging
+ import re
+ import time
+ from typing import Any, AsyncIterator, Dict, List, Optional
+
+ import httpx
+
+ from pycharter.etl_generator.extractors.base import BaseExtractor
+ from pycharter.utils.value_injector import resolve_values
+
+ logger = logging.getLogger(__name__)
+
+ # Default configuration values
+ DEFAULT_RATE_LIMIT_DELAY = 0.2
+ DEFAULT_MAX_ATTEMPTS = 3
+ DEFAULT_BACKOFF_FACTOR = 2.0
+ DEFAULT_RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+ DEFAULT_TIMEOUT_CONNECT = 10.0
+ DEFAULT_TIMEOUT_READ = 30.0
+ DEFAULT_TIMEOUT_WRITE = 10.0
+ DEFAULT_TIMEOUT_POOL = 10.0
+
+ # Common response data keys
+ RESPONSE_DATA_KEYS = ['data', 'results', 'items', 'records', 'values']
+
+
+ class HTTPExtractor(BaseExtractor):
+     """Extractor for HTTP/API data sources."""
+
+     def validate_config(self, extract_config: Dict[str, Any]) -> None:
+         """Validate HTTP extractor configuration."""
+         if 'source_type' in extract_config and extract_config['source_type'] != 'http':
+             raise ValueError(f"HTTPExtractor requires source_type='http', got '{extract_config.get('source_type')}'")
+
+         # Check for required HTTP config fields
+         if not extract_config.get('api_endpoint') and not extract_config.get('base_url'):
+             # Allow if api_endpoint is a full URL
+             api_endpoint = extract_config.get('api_endpoint', '')
+             if not api_endpoint.startswith(('http://', 'https://')):
+                 raise ValueError(
+                     "HTTP extractor requires either 'api_endpoint' (with 'base_url') "
+                     "or 'api_endpoint' as full URL"
+                 )
+
+     async def extract_streaming(
+         self,
+         extract_config: Dict[str, Any],
+         params: Dict[str, Any],
+         headers: Dict[str, Any],
+         contract_dir: Optional[Any] = None,
+         batch_size: int = 1000,
+         max_records: Optional[int] = None,
+         config_context: Optional[Dict[str, Any]] = None,
+     ) -> AsyncIterator[List[Dict[str, Any]]]:
+         """
+         Extract data from HTTP/API source with pagination support.
+
+         Yields batches as they are extracted, preventing memory exhaustion for large datasets.
+         """
+         pagination_config = extract_config.get('pagination', {})
+
+         # If pagination is not enabled, extract all and yield in batches
+         if not pagination_config.get('enabled', False):
+             logger.info("Pagination disabled, extracting all data in single request")
+             all_data = await self._extract_with_retry(
+                 extract_config, params, headers, contract_dir, config_context=config_context
+             )
+             if max_records:
+                 logger.info(f"Limiting to {max_records} records (extracted {len(all_data)})")
+                 all_data = all_data[:max_records]
+
+             logger.info(f"Yielding {len(all_data)} records in batches of {batch_size}")
+             for i in range(0, len(all_data), batch_size):
+                 batch = all_data[i:i + batch_size]
+                 logger.debug(f"Yielding batch {i // batch_size + 1} with {len(batch)} records")
+                 yield batch
+             return
+
+         # Pagination enabled - stream pages and yield in batches
+         async for batch in self._extract_with_pagination(
+             extract_config, params, headers, contract_dir, batch_size, max_records, config_context
+         ):
+             yield batch
+
+     async def _extract_with_retry(
+         self,
+         extract_config: Dict[str, Any],
+         params: Dict[str, Any],
+         headers: Dict[str, Any],
+         contract_dir: Optional[Any] = None,
+         config_context: Optional[Dict[str, Any]] = None,
+     ) -> List[Dict[str, Any]]:
+         """Extract data from API with retry logic."""
+         extracted_data, _, _ = await self._extract_single_page(
+             extract_config, params, headers, contract_dir, return_full_response=False, config_context=config_context
+         )
+         return extracted_data
+
+     async def _extract_single_page(
+         self,
+         extract_config: Dict[str, Any],
+         params: Dict[str, Any],
+         headers: Dict[str, Any],
+         contract_dir: Optional[Any] = None,
+         return_full_response: bool = False,
+         config_context: Optional[Dict[str, Any]] = None,
+     ) -> tuple[List[Dict[str, Any]], Optional[Any], Optional[httpx.Response]]:
+         """Extract data from a single API request with retry logic."""
+         # Get configuration
+         base_url = extract_config.get('base_url', '')
+         api_endpoint = extract_config.get('api_endpoint', '')
+         method = extract_config.get('method', 'GET').upper()
+         timeout_config = extract_config.get('timeout', {})
+         retry_config = extract_config.get('retry', {})
+         response_path = extract_config.get('response_path')
+         response_format = extract_config.get('response_format', 'json')
+         rate_limit_delay = extract_config.get('rate_limit_delay', DEFAULT_RATE_LIMIT_DELAY)
+         body = extract_config.get('body')
+
+         # Resolve variables and convert types
+         source_file = str(contract_dir / "extract.yaml") if contract_dir else None
+         resolved_params = resolve_values(params, context=config_context, source_file=source_file)
+         resolved_headers = resolve_values(headers, context=config_context, source_file=source_file)
+         resolved_timeout_config = resolve_values(timeout_config, context=config_context, source_file=source_file)
+         resolved_rate_limit_delay = self._resolve_rate_limit_delay(rate_limit_delay, contract_dir, config_context)
+
+         if body:
+             resolved_body = resolve_values(body, context=config_context, source_file=source_file)
+         else:
+             resolved_body = None
+
+         # Extract path parameters from api_endpoint
+         path_params = {}
+         if '{' in api_endpoint:
+             path_param_names = re.findall(r'\{(\w+)\}', api_endpoint)
+             for param_name in path_param_names:
+                 if param_name in resolved_params:
+                     path_params[param_name] = resolved_params.pop(param_name)
+
+         # Build URL with path parameter substitution
+         url = self._build_request_url(base_url, api_endpoint, path_params)
+
+         # Configure timeout
+         timeout = self._configure_timeout(resolved_timeout_config)
+
+         # Configure retry
+         max_attempts = int(retry_config.get('max_attempts', DEFAULT_MAX_ATTEMPTS))
+         backoff_factor = float(retry_config.get('backoff_factor', DEFAULT_BACKOFF_FACTOR))
+         retry_on_status = retry_config.get('retry_on_status', DEFAULT_RETRY_STATUS_CODES)
+
+         # Make request with retry logic
+         last_exception = None
+         request_start_time = None
+
+         logger.info(
+             f"Starting HTTP extraction: {method} {url} "
+             f"(timeout: connect={timeout.connect}s, read={timeout.read}s, "
+             f"max_attempts={max_attempts})"
+         )
+         logger.debug(f"Request params: {resolved_params}")
+         logger.debug(f"Request headers: {dict(resolved_headers)}")
+
+         for attempt in range(max_attempts):
+             try:
+                 request_start_time = time.time()
+                 logger.debug(f"HTTP request attempt {attempt + 1}/{max_attempts} to {url}")
+
+                 async with httpx.AsyncClient(timeout=timeout) as client:
+                     if attempt > 0:
+                         wait_time = backoff_factor ** (attempt - 1)
+                         logger.info(f"Retrying after {wait_time:.2f}s (attempt {attempt + 1}/{max_attempts})")
+                         await asyncio.sleep(wait_time)
+
+                     request_attempt_start = time.time()
+                     try:
+                         response = await self._make_http_request(
+                             client, method, url, resolved_params, resolved_headers, resolved_body
+                         )
+                         request_duration = time.time() - request_attempt_start
+                         logger.info(
+                             f"HTTP request completed: {response.status_code} "
+                             f"({request_duration:.2f}s, attempt {attempt + 1}/{max_attempts})"
+                         )
+                     except httpx.TimeoutException as timeout_error:
+                         request_duration = time.time() - request_attempt_start
+                         timeout_info = ""
+                         if hasattr(timeout_error, 'timeout') and isinstance(timeout_error.timeout, httpx.Timeout):
+                             timeout_info = (
+                                 f" (connect={timeout_error.timeout.connect}s, "
+                                 f"read={timeout_error.timeout.read}s)"
+                             )
+                         logger.error(
+                             f"HTTP request timeout after {request_duration:.2f}s{timeout_info}: "
+                             f"{type(timeout_error).__name__}: {timeout_error} "
+                             f"(attempt {attempt + 1}/{max_attempts})"
+                         )
+                         raise
+                     except httpx.RequestError as request_error:
+                         request_duration = time.time() - request_attempt_start
+                         logger.error(
+                             f"HTTP request error after {request_duration:.2f}s: "
+                             f"{type(request_error).__name__}: {request_error} "
+                             f"(attempt {attempt + 1}/{max_attempts})"
+                         )
+                         raise
+
+                     # Check if we should retry based on status code
+                     if response.status_code in retry_on_status and attempt < max_attempts - 1:
+                         wait_time = backoff_factor ** attempt
+                         logger.warning(
+                             f"HTTP {response.status_code} received, will retry after {wait_time:.2f}s "
+                             f"(attempt {attempt + 1}/{max_attempts})"
+                         )
+                         await asyncio.sleep(wait_time)
+                         continue
+
+                     # Raise for non-2xx status codes
+                     response.raise_for_status()
+
+                     # Parse response
+                     parse_start = time.time()
+                     if response_format == 'json':
+                         data = response.json()
+                     else:
+                         data = response.text
+                     parse_duration = time.time() - parse_start
+                     logger.debug(f"Response parsed in {parse_duration:.3f}s")
+
+                     # Extract data array
+                     extract_start = time.time()
+                     if response_path:
+                         extracted_data = self._extract_by_path(data, response_path)
+                     else:
+                         extracted_data = self._extract_data_array(data)
+                     extract_duration = time.time() - extract_start
+
+                     total_duration = time.time() - request_start_time
+                     logger.info(
+                         f"Extraction successful: {len(extracted_data)} records extracted "
+                         f"(total: {total_duration:.2f}s, parse: {parse_duration:.3f}s, "
+                         f"extract: {extract_duration:.3f}s)"
+                     )
+
+                     # Apply rate limiting delay
+                     if resolved_rate_limit_delay > 0:
+                         logger.debug(f"Applying rate limit delay: {resolved_rate_limit_delay}s")
+                         await asyncio.sleep(resolved_rate_limit_delay)
+
+                     if return_full_response:
+                         return extracted_data, data, response
+                     return extracted_data, None, None
+
+             except httpx.HTTPStatusError as e:
+                 last_exception = e
+                 request_duration = time.time() - request_start_time if request_start_time else 0
+
+                 logger.error(
+                     f"HTTP status error {e.response.status_code}",
+                     extra={
+                         'status_code': e.response.status_code,
+                         'url': url,
+                         'attempt': attempt + 1,
+                         'duration': request_duration,
+                     },
+                     exc_info=True
+                 )
+
+                 if e.response.status_code in retry_on_status and attempt < max_attempts - 1:
+                     wait_time = backoff_factor ** attempt
+                     await asyncio.sleep(wait_time)
+                     continue
+                 raise RuntimeError(
+                     f"HTTP error {e.response.status_code}: {e.response.text}"
+                 ) from e
+             except httpx.TimeoutException as e:
+                 last_exception = e
+                 request_duration = time.time() - request_start_time if request_start_time else 0
+
+                 logger.error(
+                     "HTTP timeout",
+                     extra={
+                         'url': url,
+                         'duration': request_duration,
+                         'attempt': attempt + 1,
+                     },
+                     exc_info=True
+                 )
+
+                 if attempt < max_attempts - 1:
+                     wait_time = backoff_factor ** attempt
+                     await asyncio.sleep(wait_time)
+                     continue
+                 raise RuntimeError(f"Request timeout after {request_duration:.2f}s: {e}") from e
+             except httpx.RequestError as e:
+                 last_exception = e
+                 request_duration = time.time() - request_start_time if request_start_time else 0
+
+                 logger.error(
+                     "HTTP request error",
+                     extra={
+                         'url': url,
+                         'duration': request_duration,
+                         'attempt': attempt + 1,
+                     },
+                     exc_info=True
+                 )
+
+                 if attempt < max_attempts - 1:
+                     wait_time = backoff_factor ** attempt
+                     await asyncio.sleep(wait_time)
+                     continue
+                 raise RuntimeError(f"Request failed: {e}") from e
+             except Exception as e:
+                 request_duration = time.time() - request_start_time if request_start_time else 0
+
+                 logger.error(
+                     "Unexpected extraction error",
+                     extra={
+                         'url': url,
+                         'duration': request_duration,
+                         'attempt': attempt + 1,
+                     },
+                     exc_info=True
+                 )
+                 raise RuntimeError(f"Extraction failed: {e}") from e
+
+         # If we exhausted all retries
+         if last_exception:
+             raise RuntimeError(
+                 f"Extraction failed after {max_attempts} attempts: {last_exception}"
+             ) from last_exception
+         raise RuntimeError("Extraction failed: unknown error")
+
+     async def _extract_with_pagination(
+         self,
+         extract_config: Dict[str, Any],
+         params: Dict[str, Any],
+         headers: Dict[str, Any],
+         contract_dir: Optional[Any] = None,
+         batch_size: int = 1000,
+         max_records: Optional[int] = None,
+         config_context: Optional[Dict[str, Any]] = None,
+     ) -> AsyncIterator[List[Dict[str, Any]]]:
+         """Extract data with pagination support."""
+         pagination_config = extract_config.get('pagination', {})
+         strategy = pagination_config.get('strategy', 'page')
+         stop_conditions = pagination_config.get('stop_conditions', [])
+         page_delay = float(pagination_config.get('page_delay', 0.1))
+         max_pages = 1000
+         max_records_from_config = None
+
+         # Get max_pages and max_records from stop conditions
+         for condition in stop_conditions:
+             if condition.get('type') == 'max_pages':
+                 max_pages = condition.get('value', 1000)
+             elif condition.get('type') == 'max_records':
+                 max_records_from_config = condition.get('value')
+
+         if max_records is None:
+             max_records = max_records_from_config
+
+         current_batch = []
+         total_extracted = 0
+         page_count = 0
+         current_url = None
+         current_cursor = None
+
+         # Initialize pagination state
+         if strategy == 'page':
+             page_config = pagination_config.get('page', {})
+             current_page = page_config.get('start', 0)
+             page_increment = page_config.get('increment', 1)
+             page_param_name = page_config.get('param_name', 'page')
+         elif strategy == 'offset':
+             offset_config = pagination_config.get('offset', {})
+             current_offset = offset_config.get('start', 0)
+             offset_param_name = offset_config.get('param_name', 'offset')
+             increment_by = offset_config.get('increment_by', 'limit')
+         elif strategy == 'cursor':
+             cursor_config = pagination_config.get('cursor', {})
+             cursor_param_name = cursor_config.get('param_name', 'cursor')
+             cursor_response_path = cursor_config.get('response_path', 'next_cursor')
+         elif strategy == 'next_url':
+             next_url_config = pagination_config.get('next_url', {})
+             next_url_response_path = next_url_config.get('response_path', 'next_url')
+         elif strategy == 'link_header':
+             pass
+         else:
+             raise ValueError(f"Unsupported pagination strategy: {strategy}")
+
+         extract_config_copy = extract_config.copy()
+         original_endpoint = extract_config_copy.get('api_endpoint')
+         original_base_url = extract_config_copy.get('base_url', '')
+
+         logger.info(
+             f"Starting paginated extraction (strategy: {strategy}, "
+             f"max_pages: {max_pages}, batch_size: {batch_size}, "
+             f"page_delay: {page_delay}s)"
+         )
+
+         while page_count < max_pages:
+             # Check max_records limit
+             if max_records and total_extracted >= max_records:
+                 logger.info(
+                     f"Reached max_records limit ({max_records}), stopping pagination "
+                     f"(extracted {total_extracted} records from {page_count} pages)"
+                 )
+                 if current_batch:
+                     yield current_batch
+                 return
+
+             # Update params/URL based on strategy
+             if strategy == 'page':
+                 params[page_param_name] = current_page
+                 logger.debug(f"Fetching page {current_page} (page_count: {page_count + 1}/{max_pages})")
+             elif strategy == 'offset':
+                 params[offset_param_name] = current_offset
+             elif strategy == 'cursor' and current_cursor:
+                 params[cursor_param_name] = current_cursor
+             elif strategy == 'next_url' and current_url:
+                 extract_config_copy['api_endpoint'] = current_url
+                 extract_config_copy['base_url'] = ''
+
+             # Make request
+             need_full_response = strategy in ['cursor', 'next_url', 'link_header']
+             try:
+                 logger.debug(f"Extracting page {page_count + 1} (total extracted so far: {total_extracted})")
+                 page_data, full_response_data, response_obj = await self._extract_single_page(
+                     extract_config_copy, params, headers, contract_dir, return_full_response=need_full_response, config_context=config_context
+                 )
+                 logger.info(f"Page {page_count + 1} extracted: {len(page_data)} records")
+             except Exception as e:
+                 logger.error(
+                     f"Error extracting page {page_count + 1}",
+                     extra={
+                         'page': page_count + 1,
+                         'extracted': total_extracted,
+                     },
+                     exc_info=True
+                 )
+                 if current_batch:
+                     yield current_batch
+                 raise
+
+             # Restore original endpoint if modified
+             if strategy == 'next_url' and current_url:
+                 extract_config_copy['api_endpoint'] = original_endpoint
+                 extract_config_copy['base_url'] = original_base_url
+
+             # Check for empty page first
+             if not page_data:
+                 logger.info(f"Empty page {page_count + 1} received, stopping pagination")
+                 if current_batch:
+                     yield current_batch
+                 break
+
+             # Check stop conditions
+             page_count += 1
+             limit_value = params.get('limit', 100)
+             record_count = len(page_data)
+             logger.info(
+                 f"Evaluating stop conditions for page {page_count}: "
+                 f"{record_count} records returned, limit={limit_value}"
+             )
+             should_stop = self._check_stop_conditions(page_data, stop_conditions, params, full_response_data)
+             if should_stop:
+                 logger.info(
+                     f"✅ Stop condition met at page {page_count} "
+                     f"(page returned {record_count} records, limit: {limit_value})"
+                 )
+                 for record in page_data:
+                     current_batch.append(record)
+                     total_extracted += 1
+                     if len(current_batch) >= batch_size:
+                         yield current_batch
+                         current_batch = []
+                 if current_batch:
+                     yield current_batch
+                 break
+
+             # Add page data to current batch
+             for record in page_data:
+                 current_batch.append(record)
+                 total_extracted += 1
+
+                 if len(current_batch) >= batch_size:
+                     yield current_batch
+                     current_batch = []
+
+                 if max_records and total_extracted >= max_records:
+                     if current_batch:
+                         yield current_batch
+                     return
+
+             # Extract pagination token/URL for next iteration
+             if strategy == 'cursor' and full_response_data:
+                 try:
+                     current = full_response_data
+                     for part in cursor_response_path.split('.'):
+                         if isinstance(current, dict):
+                             current = current.get(part)
+                         elif isinstance(current, list) and part.isdigit():
+                             current = current[int(part)]
+                         else:
+                             current = None
+                             break
+
+                     if current and isinstance(current, str):
+                         current_cursor = current
+                     elif current:
+                         current_cursor = str(current)
+                     else:
+                         if current_batch:
+                             yield current_batch
+                         break
+                 except (KeyError, IndexError, TypeError, ValueError):
+                     if current_batch:
+                         yield current_batch
+                     break
+
+             elif strategy == 'next_url' and full_response_data:
+                 try:
+                     current = full_response_data
+                     for part in next_url_response_path.split('.'):
+                         if isinstance(current, dict):
+                             current = current.get(part)
+                         elif isinstance(current, list) and part.isdigit():
+                             current = current[int(part)]
+                         else:
+                             current = None
+                             break
+
+                     if current and isinstance(current, str):
+                         current_url = current
+                     else:
+                         current_url = None
+
+                     if not current_url:
+                         if current_batch:
+                             yield current_batch
+                         break
+                 except (KeyError, IndexError, TypeError, ValueError):
+                     if current_batch:
+                         yield current_batch
+                     break
+
+             elif strategy == 'link_header' and response_obj:
+                 current_url = self._extract_link_header_url(response_obj)
+                 if not current_url:
+                     if current_batch:
+                         yield current_batch
+                     break
+                 extract_config_copy['api_endpoint'] = current_url
+                 extract_config_copy['base_url'] = ''
+
+             # Update pagination state
+             if strategy == 'page':
+                 current_page += page_increment
+             elif strategy == 'offset':
+                 limit = params.get('limit', 100)
+                 if increment_by == 'limit':
+                     current_offset += limit
+                 else:
+                     current_offset += int(increment_by)
+
+             # Delay between pages
+             if page_delay > 0:
+                 await asyncio.sleep(page_delay)
+
+         # Yield remaining records
+         if current_batch:
+             yield current_batch
+
+     # Helper methods
+     def _resolve_rate_limit_delay(
+         self,
+         rate_limit_delay: Any,
+         contract_dir: Optional[Any] = None,
+         config_context: Optional[Dict[str, Any]] = None,
+     ) -> float:
+         """Resolve and convert rate_limit_delay to float."""
+         if isinstance(rate_limit_delay, str):
+             source_file = str(contract_dir / "extract.yaml") if contract_dir else None
+             resolved = resolve_values(rate_limit_delay, context=config_context, source_file=source_file)
+             return float(resolved)
+         return float(rate_limit_delay)
+
+     def _build_request_url(
+         self,
+         base_url: str,
+         api_endpoint: str,
+         path_params: Optional[Dict[str, Any]] = None,
+     ) -> str:
+         """Build full request URL from base URL and endpoint."""
+         if api_endpoint.startswith(('http://', 'https://')):
+             url = api_endpoint
+         elif base_url:
+             base_url = base_url.rstrip('/')
+             endpoint = api_endpoint.lstrip('/')
+             url = f"{base_url}/{endpoint}"
+         else:
+             raise ValueError(
+                 "Either 'api_endpoint' must be a full URL (starting with http:// or https://) "
+                 "or 'base_url' must be provided in extract.yaml"
+             )
+
+         # Substitute path parameters
+         if path_params and '{' in url:
+             try:
+                 url = url.format(**path_params)
+             except KeyError as e:
+                 raise ValueError(
+                     f"Missing required path parameter in URL: {e}. "
+                     f"URL: {url}, Available params: {list(path_params.keys())}"
+                 ) from e
+
+         return url
+
+     def _configure_timeout(self, timeout_config: Dict[str, Any]) -> httpx.Timeout:
+         """Configure HTTP timeout from config dictionary."""
+         timeout = httpx.Timeout(
+             connect=float(timeout_config.get('connect', DEFAULT_TIMEOUT_CONNECT)),
+             read=float(timeout_config.get('read', DEFAULT_TIMEOUT_READ)),
+             write=float(timeout_config.get('write', DEFAULT_TIMEOUT_WRITE)),
+             pool=float(timeout_config.get('pool', DEFAULT_TIMEOUT_POOL)),
+         )
+         logger.debug(
+             f"Configured HTTP timeout: connect={timeout.connect}s, "
+             f"read={timeout.read}s, write={timeout.write}s, pool={timeout.pool}s"
+         )
+         return timeout
+
+     async def _make_http_request(
+         self,
+         client: httpx.AsyncClient,
+         method: str,
+         url: str,
+         params: Dict[str, Any],
+         headers: Dict[str, Any],
+         body: Optional[Any] = None,
+     ) -> httpx.Response:
+         """Make HTTP request with specified method."""
+         method = method.upper()
+
+         logger.debug(f"Making {method} request to {url}")
+
+         try:
+             if method == 'GET':
+                 return await client.get(url, params=params, headers=headers)
+             elif method == 'POST':
+                 if body:
+                     return await client.post(
+                         url,
+                         json=body if isinstance(body, dict) else body,
+                         params=params,
+                         headers=headers,
+                     )
+                 else:
+                     return await client.post(url, params=params, headers=headers)
+             else:
+                 raise ValueError(f"Unsupported HTTP method: {method}")
+         except httpx.TimeoutException as e:
+             timeout_info = ""
+             if hasattr(e, 'timeout') and isinstance(e.timeout, httpx.Timeout):
+                 timeout_info = (
+                     f" (connect timeout: {e.timeout.connect}s, "
+                     f"read timeout: {e.timeout.read}s)"
+                 )
+             logger.error(f"HTTP request timeout for {method} {url}{timeout_info}")
+             raise
+         except httpx.RequestError as e:
+             logger.error(f"HTTP request error for {method} {url}: {type(e).__name__}: {e}")
+             raise
+
+     def _extract_by_path(self, data: Any, path: str) -> List[Dict[str, Any]]:
+         """Extract data using a simple path notation (e.g., 'data.items')."""
+         current = data
+         for part in path.split('.'):
+             if isinstance(current, dict):
+                 current = current.get(part)
+             elif isinstance(current, list) and part.isdigit():
+                 current = current[int(part)]
+             else:
+                 return []
+
+         if current is None:
+             return []
+
+         if isinstance(current, list):
+             return current
+         elif isinstance(current, dict):
+             return [current]
+         else:
+             return []
+
+     def _extract_data_array(self, data: Any) -> List[Dict[str, Any]]:
+         """Extract data array from response, handling common response structures."""
+         if isinstance(data, list):
+             return data
+         elif isinstance(data, dict):
+             # Try common keys for data arrays
+             for key in RESPONSE_DATA_KEYS:
+                 if key in data and isinstance(data[key], list):
+                     return data[key]
+             # If no array found, return as single-item list
+             return [data]
+         else:
+             return []
+
+     def _check_stop_conditions(
+         self,
+         page_data: List[Dict[str, Any]],
+         stop_conditions: List[Dict[str, Any]],
+         params: Dict[str, Any],
+         response_data: Any = None,
+     ) -> bool:
+         """Check if pagination should stop based on configured stop conditions."""
+         if not stop_conditions:
+             # Default: stop if fewer records than limit
+             limit = params.get('limit', 100)
+             return len(page_data) < limit
+
+         for condition in stop_conditions:
+             if self._check_stop_condition(condition, page_data, params, response_data):
+                 return True
+
+         return False
+
+     def _check_stop_condition(
+         self,
+         condition: Dict[str, Any],
+         page_data: List[Dict[str, Any]],
+         params: Dict[str, Any],
+         response_data: Any = None,
+     ) -> bool:
+         """Check a single stop condition."""
+         condition_type = condition.get('type')
+
+         if condition_type == 'empty_response':
+             if not page_data:
+                 logger.debug("Stop condition 'empty_response' triggered: page is empty")
+                 return True
+
+         elif condition_type == 'fewer_records':
+             limit = params.get('limit', 100)
+             record_count = len(page_data)
+             if record_count < limit:
+                 logger.debug(
+                     f"Stop condition 'fewer_records' triggered: "
+                     f"page returned {record_count} records < limit {limit}"
+                 )
+                 return True
+
+         elif condition_type == 'max_pages':
+             max_pages = condition.get('value', 1000)
+             current_page = params.get('page', 0)
+             if current_page >= max_pages:
+                 logger.debug(f"Stop condition 'max_pages' triggered: page {current_page} >= {max_pages}")
+                 return True
+
+         elif condition_type == 'custom':
+             return self._check_custom_stop_condition(condition, response_data)
+
+         return False
+
+     def _check_custom_stop_condition(
+         self,
+         condition: Dict[str, Any],
+         response_data: Any,
+     ) -> bool:
+         """Check custom stop condition based on response path."""
+         response_path = condition.get('response_path')
+         expected_value = condition.get('value')
+
+         if not response_path or not response_data:
+             return False
+
+         try:
+             current = response_data
+             for part in response_path.split('.'):
+                 if isinstance(current, dict):
+                     current = current.get(part)
+                 elif isinstance(current, list) and part.isdigit():
+                     current = current[int(part)]
+                 else:
+                     return False
+             return current == expected_value
+         except (KeyError, IndexError, TypeError):
+             return False
+
+     def _extract_link_header_url(self, response: httpx.Response) -> Optional[str]:
+         """Extract next URL from Link header (RFC 5988)."""
+         link_header = response.headers.get('Link', '')
+         if not link_header:
+             return None
+
+         # Parse Link header: <url>; rel="next"
+         pattern = r'<([^>]+)>;\s*rel=["\']?next["\']?'
+         match = re.search(pattern, link_header, re.IGNORECASE)
+         if match:
+             return match.group(1)
+
+         return None
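Taken together, the new module exposes extraction as an async stream of record batches. A minimal consumption sketch (assuming BaseExtractor needs no constructor arguments; the endpoint, params, and field names are hypothetical):

import asyncio

from pycharter.etl_generator.extractors.http import HTTPExtractor

async def main() -> None:
    extractor = HTTPExtractor()
    extract_config = {
        "source_type": "http",
        "base_url": "https://api.example.com",  # hypothetical API
        "api_endpoint": "/v1/records",
        "pagination": {
            "enabled": True,
            "strategy": "page",
            "page": {"param_name": "page", "start": 1},
            "stop_conditions": [{"type": "fewer_records"}],
        },
    }
    extractor.validate_config(extract_config)
    # Batches arrive as lists of dicts, each at most batch_size records.
    async for batch in extractor.extract_streaming(
        extract_config, params={"limit": 100}, headers={}, batch_size=500
    ):
        print(f"received {len(batch)} records")

asyncio.run(main())

Note on the retry defaults visible in the diff: with max_attempts = 3 and backoff_factor = 2.0, a retryable failure is waited out for backoff_factor ** attempt seconds, i.e. 1s after the first attempt and 2s after the second, before the extractor gives up and raises RuntimeError.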