@dataif/cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +16 -0
  2. package/bin/dataif.js +623 -0
  3. package/package.json +26 -0
  4. package/scripts/build-template.mjs +72 -0
  5. package/templates/dataif/README.md +157 -0
  6. package/templates/dataif/infra/.env.example +119 -0
  7. package/templates/dataif/infra/.env.stg.example +119 -0
  8. package/templates/dataif/infra/airflow/Dockerfile +11 -0
  9. package/templates/dataif/infra/airflow/Dockerfile.release +17 -0
  10. package/templates/dataif/infra/airflow/requirements.txt +3 -0
  11. package/templates/dataif/infra/docker-compose.yml +306 -0
  12. package/templates/dataif/infra/init-db/01-init-dataif.sh +129 -0
  13. package/templates/dataif/infra/init-db/pnp-curated-views.sqlinc +444 -0
  14. package/templates/dataif/infra/init-db/pnp-raw-staging-curated.sqlinc +701 -0
  15. package/templates/dataif/infra/keycloak/Dockerfile +4 -0
  16. package/templates/dataif/infra/keycloak/realm-dataif.json +73 -0
  17. package/templates/dataif/infra/ollama/Dockerfile +9 -0
  18. package/templates/dataif/infra/ollama/bootstrap-model.sh +100 -0
  19. package/templates/dataif/infra/ollama/sabia-7b.Modelfile +14 -0
  20. package/templates/dataif/infra/postgres/Dockerfile +4 -0
  21. package/templates/dataif/pipelines/airflow/dags/generated/.gitkeep +1 -0
  22. package/templates/dataif/pipelines/airflow/dags/generated/2020_financeiro_fcc6f1f3_sync.py +9 -0
  23. package/templates/dataif/pipelines/dataif_pipelines/__init__.py +1 -0
  24. package/templates/dataif/pipelines/dataif_pipelines/airflow/__init__.py +1 -0
  25. package/templates/dataif/pipelines/dataif_pipelines/airflow/pnp_pipeline_factory.py +167 -0
  26. package/templates/dataif/pipelines/dataif_pipelines/connectors/__init__.py +1 -0
  27. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/__init__.py +1 -0
  28. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/connector.py +28 -0
  29. package/templates/dataif/pipelines/dataif_pipelines/connectors/base/types.py +14 -0
  30. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/__init__.py +1 -0
  31. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/config.py +19 -0
  32. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/connector.py +558 -0
  33. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/powerbi_microdados.py +728 -0
  34. package/templates/dataif/pipelines/dataif_pipelines/connectors/nilo_pecanha/transform.py +296 -0
  35. package/templates/dataif/pipelines/dataif_pipelines/jobs/__init__.py +1 -0
  36. package/templates/dataif/pipelines/dataif_pipelines/jobs/nilo_pipeline.py +112 -0
  37. package/templates/dataif/pipelines/dataif_pipelines/orchestration/__init__.py +21 -0
  38. package/templates/dataif/pipelines/dataif_pipelines/orchestration/pnp_workflow.py +783 -0
  39. package/templates/dataif/pipelines/dataif_pipelines/repositories/__init__.py +1 -0
  40. package/templates/dataif/pipelines/dataif_pipelines/repositories/pnp_raw_repository.py +860 -0
  41. package/templates/dataif/pipelines/dataif_pipelines/services/__init__.py +19 -0
  42. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_curated_service.py +66 -0
  43. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_download_service.py +534 -0
  44. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_quality_service.py +9 -0
  45. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_raw_ingestion_service.py +124 -0
  46. package/templates/dataif/pipelines/dataif_pipelines/services/pnp_staging_service.py +271 -0
  47. package/templates/dataif/pipelines/dataif_pipelines/services/powerbi_catalog_service.py +159 -0
  48. package/templates/dataif/pipelines/sql/staging/020_pnp_matriculas.sql +112 -0
  49. package/templates/dataif/pipelines/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  50. package/templates/dataif/pipelines/sql/staging/040_pnp_servidores.sql +90 -0
  51. package/templates/dataif/pipelines/sql/staging/050_pnp_financeiro.sql +72 -0
  52. package/templates/dataif/pipelines/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  53. package/templates/dataif/pipelines/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  54. package/templates/dataif/pipelines/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  55. package/templates/dataif/pipelines/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  56. package/templates/dataif/pipelines/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  57. package/templates/dataif/pipelines/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  58. package/templates/dataif/pipelines/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  59. package/templates/dataif/pipelines/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
  60. package/templates/dataif/scripts/configure-env.sh +149 -0
  61. package/templates/dataif/scripts/create_metabase_pnp_dashboard.py +943 -0
  62. package/templates/dataif/scripts/create_metabase_pnp_matriculas_dashboard.py +580 -0
  63. package/templates/dataif/scripts/deploy.sh +79 -0
  64. package/templates/dataif/scripts/fix_metabase_template_tag_ids.py +91 -0
  65. package/templates/dataif/scripts/pnp_powerbi_microdados_probe.py +14 -0
  66. package/templates/dataif/scripts/pnp_validate_raw_run.py +330 -0
  67. package/templates/dataif/scripts/publish-images.sh +31 -0
  68. package/templates/dataif/scripts/sync_metabase_dashboard_field_filters.py +241 -0
  69. package/templates/dataif/scripts/use-vanna-ollama.sh +139 -0
  70. package/templates/dataif/services/api/.dockerignore +18 -0
  71. package/templates/dataif/services/api/Dockerfile +12 -0
  72. package/templates/dataif/services/api/app/__init__.py +1 -0
  73. package/templates/dataif/services/api/app/auth.py +48 -0
  74. package/templates/dataif/services/api/app/config.py +59 -0
  75. package/templates/dataif/services/api/app/keycloak_admin.py +215 -0
  76. package/templates/dataif/services/api/app/main.py +2432 -0
  77. package/templates/dataif/services/api/app/metabase_admin.py +191 -0
  78. package/templates/dataif/services/api/app/metabase_bootstrap.py +44 -0
  79. package/templates/dataif/services/api/app/metabase_embed.py +15 -0
  80. package/templates/dataif/services/api/app/pnp_dag_provisioner.py +113 -0
  81. package/templates/dataif/services/api/app/pnp_instance_repository.py +951 -0
  82. package/templates/dataif/services/api/app/pnp_powerbi.py +438 -0
  83. package/templates/dataif/services/api/app/vanna_client.py +32 -0
  84. package/templates/dataif/services/api/requirements.txt +9 -0
  85. package/templates/dataif/services/vanna/.dockerignore +18 -0
  86. package/templates/dataif/services/vanna/Dockerfile +12 -0
  87. package/templates/dataif/services/vanna/app/config.py +57 -0
  88. package/templates/dataif/services/vanna/app/main.py +108 -0
  89. package/templates/dataif/services/vanna/app/runtime_config.py +114 -0
  90. package/templates/dataif/services/vanna/app/sql_guard.py +123 -0
  91. package/templates/dataif/services/vanna/app/vanna_engine.py +382 -0
  92. package/templates/dataif/services/vanna/requirements.txt +8 -0
  93. package/templates/dataif/services/web/.dockerignore +13 -0
  94. package/templates/dataif/services/web/Dockerfile +16 -0
  95. package/templates/dataif/services/web/index.html +12 -0
  96. package/templates/dataif/services/web/nginx.conf +74 -0
  97. package/templates/dataif/services/web/package-lock.json +4397 -0
  98. package/templates/dataif/services/web/package.json +32 -0
  99. package/templates/dataif/services/web/postcss.config.mjs +5 -0
  100. package/templates/dataif/services/web/src/App.jsx +2817 -0
  101. package/templates/dataif/services/web/src/adminAuth.js +245 -0
  102. package/templates/dataif/services/web/src/assets/avatar_placeholder.png +0 -0
  103. package/templates/dataif/services/web/src/assets/github_logo_icon_229278.svg +1 -0
  104. package/templates/dataif/services/web/src/assets/if-logo.png +0 -0
  105. package/templates/dataif/services/web/src/assets/if.svg +0 -0
  106. package/templates/dataif/services/web/src/assets/pnp-horizontal.svg +1 -0
  107. package/templates/dataif/services/web/src/components/AppHeader.jsx +233 -0
  108. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/mobile-header.tsx +56 -0
  109. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-account-card.tsx +209 -0
  110. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item-button.tsx +67 -0
  111. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-item.tsx +108 -0
  112. package/templates/dataif/services/web/src/components/application/app-navigation/base-components/nav-list.tsx +83 -0
  113. package/templates/dataif/services/web/src/components/application/app-navigation/config.ts +23 -0
  114. package/templates/dataif/services/web/src/components/application/app-navigation/header-navigation.tsx +240 -0
  115. package/templates/dataif/services/web/src/components/application/pagination/pagination-base.tsx +376 -0
  116. package/templates/dataif/services/web/src/components/application/pagination/pagination-dot.tsx +52 -0
  117. package/templates/dataif/services/web/src/components/application/pagination/pagination-line.tsx +48 -0
  118. package/templates/dataif/services/web/src/components/application/pagination/pagination.tsx +328 -0
  119. package/templates/dataif/services/web/src/components/application/tabs/tabs.tsx +223 -0
  120. package/templates/dataif/services/web/src/components/base/avatar/avatar-label-group.tsx +28 -0
  121. package/templates/dataif/services/web/src/components/base/avatar/avatar.tsx +129 -0
  122. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-add-button.tsx +32 -0
  123. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-company-icon.tsx +24 -0
  124. package/templates/dataif/services/web/src/components/base/avatar/base-components/avatar-online-indicator.tsx +29 -0
  125. package/templates/dataif/services/web/src/components/base/avatar/base-components/index.tsx +4 -0
  126. package/templates/dataif/services/web/src/components/base/avatar/base-components/verified-tick.tsx +32 -0
  127. package/templates/dataif/services/web/src/components/base/badges/badge-types.ts +264 -0
  128. package/templates/dataif/services/web/src/components/base/badges/badges.tsx +415 -0
  129. package/templates/dataif/services/web/src/components/base/button-group/button-group.tsx +104 -0
  130. package/templates/dataif/services/web/src/components/base/buttons/button.tsx +267 -0
  131. package/templates/dataif/services/web/src/components/base/input/hint-text.tsx +31 -0
  132. package/templates/dataif/services/web/src/components/base/input/input.tsx +269 -0
  133. package/templates/dataif/services/web/src/components/base/input/label.tsx +48 -0
  134. package/templates/dataif/services/web/src/components/base/radio-buttons/radio-buttons.tsx +127 -0
  135. package/templates/dataif/services/web/src/components/base/select/combobox.tsx +150 -0
  136. package/templates/dataif/services/web/src/components/base/select/multi-select.tsx +361 -0
  137. package/templates/dataif/services/web/src/components/base/select/popover.tsx +32 -0
  138. package/templates/dataif/services/web/src/components/base/select/select-item.tsx +95 -0
  139. package/templates/dataif/services/web/src/components/base/select/select-native.tsx +67 -0
  140. package/templates/dataif/services/web/src/components/base/select/select.tsx +144 -0
  141. package/templates/dataif/services/web/src/components/base/tags/base-components/tag-close-x.tsx +32 -0
  142. package/templates/dataif/services/web/src/components/base/tooltip/tooltip.tsx +107 -0
  143. package/templates/dataif/services/web/src/components/foundations/dot-icon.tsx +22 -0
  144. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo-minimal.tsx +170 -0
  145. package/templates/dataif/services/web/src/components/foundations/logo/untitledui-logo.tsx +58 -0
  146. package/templates/dataif/services/web/src/hooks/use-breakpoint.ts +34 -0
  147. package/templates/dataif/services/web/src/hooks/use-resize-observer.ts +67 -0
  148. package/templates/dataif/services/web/src/main.jsx +14 -0
  149. package/templates/dataif/services/web/src/providers/theme-provider.jsx +62 -0
  150. package/templates/dataif/services/web/src/styles/globals.css +60 -0
  151. package/templates/dataif/services/web/src/styles/theme.css +1326 -0
  152. package/templates/dataif/services/web/src/styles/typography.css +430 -0
  153. package/templates/dataif/services/web/src/styles.css +1287 -0
  154. package/templates/dataif/services/web/src/utils/cx.ts +24 -0
  155. package/templates/dataif/services/web/src/utils/is-react-component.ts +33 -0
  156. package/templates/dataif/services/web/vite.config.js +14 -0
  157. package/templates/dataif/sql/ddl/001_schemas.sql +6 -0
  158. package/templates/dataif/sql/ddl/003_pnp_raw_staging_curated.sql +699 -0
  159. package/templates/dataif/sql/migrations/001_pnp_phase1_backfill.sql +3 -0
  160. package/templates/dataif/sql/migrations/002_pnp_phase2_admin_config_backfill.sql +184 -0
  161. package/templates/dataif/sql/migrations/003_pnp_phase3_raw_tabular_backfill.sql +3 -0
  162. package/templates/dataif/sql/migrations/004_pnp_phase3_raw_backfill_support_index.sql +3 -0
  163. package/templates/dataif/sql/migrations/005_pnp_phase7_staging_support_indexes.sql +2 -0
  164. package/templates/dataif/sql/migrations/006_pnp_phase7_staging_autovacuum_tuning.sql +2 -0
  165. package/templates/dataif/sql/migrations/007_pnp_phase7b_run_packages.sql +20 -0
  166. package/templates/dataif/sql/migrations/008_pnp_phase7a_pipeline_endpoints.sql +169 -0
  167. package/templates/dataif/sql/migrations/009_pnp_phase8_curated.sql +35 -0
  168. package/templates/dataif/sql/migrations/010_pnp_phase10_staging_incremental_upsert.sql +3 -0
  169. package/templates/dataif/sql/migrations/010_pnp_pipeline_uuid.sql +51 -0
  170. package/templates/dataif/sql/migrations/011_app_settings.sql +7 -0
  171. package/templates/dataif/sql/staging/020_pnp_matriculas.sql +112 -0
  172. package/templates/dataif/sql/staging/030_pnp_eficiencia_academica.sql +83 -0
  173. package/templates/dataif/sql/staging/040_pnp_servidores.sql +90 -0
  174. package/templates/dataif/sql/staging/050_pnp_financeiro.sql +72 -0
  175. package/templates/dataif/sql/views_curated/003_vw_pnp_microdados_admin.sql +160 -0
  176. package/templates/dataif/sql/views_curated/004_mv_pnp_dashboard_fast.sql +204 -0
  177. package/templates/dataif/sql/views_curated/010_vw_pnp_admin_ingestao.sql +51 -0
  178. package/templates/dataif/sql/views_curated/020_vw_pnp_qualidade_dados.sql +114 -0
  179. package/templates/dataif/sql/views_curated/030_vw_pnp_matriculas.sql +67 -0
  180. package/templates/dataif/sql/views_curated/040_vw_pnp_eficiencia.sql +33 -0
  181. package/templates/dataif/sql/views_curated/050_vw_pnp_servidores.sql +30 -0
  182. package/templates/dataif/sql/views_curated/060_vw_pnp_financeiro.sql +22 -0
  183. package/templates/dataif/sql/views_curated/070_vw_pnp_vanna.sql +115 -0
@@ -0,0 +1,860 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ from datetime import datetime
5
+ from typing import Any
6
+
7
+ import psycopg2
8
+ from psycopg2 import sql
9
+ from psycopg2.extras import Json, RealDictCursor, execute_values
10
+
11
+
12
def register_run_start(
    dsn: str,
    *,
    run_id: str,
    instance_key: str | None,
    airflow_dag_id: str | None = None,
    airflow_dag_run_id: str | None = None,
    status: str,
    trigger_mode: str,
    requested_by: str,
    logical_date: datetime | None,
    started_at: datetime,
) -> None:
    """Insert the ``raw.pnp_runs`` row for a run, or overwrite it on conflict.

    When a row with the same ``run_id`` already exists, every column written
    by the INSERT is replaced with the newly supplied value.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        run_id: Identifier of the run; also the conflict target.
        instance_key: Instance the run belongs to, if any.
        airflow_dag_id: Optional Airflow DAG id stored with the run.
        airflow_dag_run_id: Optional Airflow DAG run id stored with the run.
        status: Status value stored for the run.
        trigger_mode: Stored as ``trigger_mode``.
        requested_by: Stored as ``requested_by``.
        logical_date: Stored as ``logical_date``; may be ``None``.
        started_at: Stored as ``started_at``.
    """
    conn = psycopg2.connect(dsn)
    try:
        # ``with conn`` only commits (or rolls back on error); it does not
        # close the connection, so that is done explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO raw.pnp_runs (
                    run_id,
                    instance_key,
                    airflow_dag_id,
                    airflow_dag_run_id,
                    logical_date,
                    trigger_mode,
                    requested_by,
                    status,
                    started_at
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (run_id) DO UPDATE
                SET
                    instance_key = EXCLUDED.instance_key,
                    airflow_dag_id = EXCLUDED.airflow_dag_id,
                    airflow_dag_run_id = EXCLUDED.airflow_dag_run_id,
                    logical_date = EXCLUDED.logical_date,
                    trigger_mode = EXCLUDED.trigger_mode,
                    requested_by = EXCLUDED.requested_by,
                    status = EXCLUDED.status,
                    started_at = EXCLUDED.started_at
                """,
                (
                    run_id,
                    instance_key,
                    airflow_dag_id,
                    airflow_dag_run_id,
                    logical_date,
                    trigger_mode,
                    requested_by,
                    status,
                    started_at,
                ),
            )
    finally:
        conn.close()
63
+
64
+
65
def register_run_step_start(
    dsn: str,
    *,
    run_id: str,
    instance_key: str | None,
    airflow_task_id: str,
    map_index: int | None,
    status: str,
    details: dict[str, Any],
    started_at: datetime,
) -> None:
    """Insert a ``raw.pnp_run_steps`` row for a step, or reset it on conflict.

    The statement's conflict target is
    ``(run_id, airflow_task_id, map_index_key)``. On conflict the existing row
    receives the new instance key, status, start time and details, while
    ``finished_at``, ``records_affected`` and ``error_message`` are set back
    to NULL.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        run_id: Identifier of the run the step belongs to.
        instance_key: Instance the run belongs to, if any.
        airflow_task_id: Airflow task id of the step.
        map_index: Stored as ``map_index``; may be ``None``.
        status: Status value stored for the step.
        details: Dictionary stored as JSON in ``details_json``.
        started_at: Stored as ``started_at``.
    """
    conn = psycopg2.connect(dsn)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO raw.pnp_run_steps (
                    run_id,
                    instance_key,
                    airflow_task_id,
                    map_index,
                    status,
                    started_at,
                    details_json
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (run_id, airflow_task_id, map_index_key) DO UPDATE
                SET
                    instance_key = EXCLUDED.instance_key,
                    status = EXCLUDED.status,
                    started_at = EXCLUDED.started_at,
                    finished_at = NULL,
                    records_affected = NULL,
                    error_message = NULL,
                    details_json = EXCLUDED.details_json
                """,
                (
                    run_id,
                    instance_key,
                    airflow_task_id,
                    map_index,
                    status,
                    started_at,
                    Json(details),
                ),
            )
    finally:
        conn.close()
109
+
110
+
111
def finish_run_step(
    dsn: str,
    *,
    run_id: str,
    airflow_task_id: str,
    map_index: int | None,
    status: str,
    finished_at: datetime,
    records_affected: int | None,
    error_message: str | None,
    details: dict[str, Any],
) -> None:
    """Record the outcome of a step in ``raw.pnp_run_steps``.

    The row is located by ``run_id``, ``airflow_task_id`` and
    ``map_index_key``; a ``map_index`` of ``None`` is matched as ``-1``.
    ``details`` is appended to the stored ``details_json`` with the SQL ``||``
    operator instead of replacing it.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        run_id: Identifier of the run the step belongs to.
        airflow_task_id: Airflow task id of the step.
        map_index: Map index of the step, or ``None``.
        status: Final status value for the step.
        finished_at: Stored as ``finished_at``.
        records_affected: Stored as ``records_affected``; may be ``None``.
        error_message: Stored as ``error_message``; may be ``None``.
        details: Dictionary combined into ``details_json``.
    """
    conn = psycopg2.connect(dsn)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                UPDATE raw.pnp_run_steps
                SET
                    status = %s,
                    finished_at = %s,
                    records_affected = %s,
                    error_message = %s,
                    details_json = details_json || %s
                WHERE run_id = %s
                  AND airflow_task_id = %s
                  AND map_index_key = COALESCE(%s, -1)
                """,
                (
                    status,
                    finished_at,
                    records_affected,
                    error_message,
                    Json(details),
                    run_id,
                    airflow_task_id,
                    map_index,
                ),
            )
    finally:
        conn.close()
148
+
149
+
150
def append_run_package(
    dsn: str,
    *,
    run_id: str,
    instance_key: str | None,
    airflow_dag_id: str | None,
    airflow_dag_run_id: str | None,
    airflow_task_id: str,
    package_type: str,
    package_name: str,
    package_status: str,
    records_affected: int | None,
    payload: dict[str, Any],
) -> None:
    """Append one row to ``raw.pnp_run_packages``.

    This is a plain INSERT with no conflict handling.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        run_id: Identifier of the run the package belongs to.
        instance_key: Instance the run belongs to, if any.
        airflow_dag_id: Airflow DAG id, if any.
        airflow_dag_run_id: Airflow DAG run id, if any.
        airflow_task_id: Airflow task id that produced the package.
        package_type: Stored as ``package_type``.
        package_name: Stored as ``package_name``.
        package_status: Stored as ``package_status``.
        records_affected: Stored as ``records_affected``; may be ``None``.
        payload: Dictionary stored as JSON in ``payload_json``.
    """
    conn = psycopg2.connect(dsn)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO raw.pnp_run_packages (
                    run_id,
                    instance_key,
                    airflow_dag_id,
                    airflow_dag_run_id,
                    airflow_task_id,
                    package_type,
                    package_name,
                    package_status,
                    records_affected,
                    payload_json
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """,
                (
                    run_id,
                    instance_key,
                    airflow_dag_id,
                    airflow_dag_run_id,
                    airflow_task_id,
                    package_type,
                    package_name,
                    package_status,
                    records_affected,
                    Json(payload),
                ),
            )
    finally:
        conn.close()
194
+
195
+
196
def finish_run(
    dsn: str,
    *,
    run_id: str,
    status: str,
    catalog_entry_count: int,
    selected_download_count: int,
    downloaded_file_count: int,
    raw_record_count: int,
    error_message: str | None,
    run_summary: dict[str, Any],
    finished_at: datetime,
) -> None:
    """Write the final status, counters and summary of a run to ``raw.pnp_runs``.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        run_id: Identifier of the run to update.
        status: Final status value for the run.
        catalog_entry_count: Stored as ``catalog_entry_count``.
        selected_download_count: Stored as ``selected_download_count``.
        downloaded_file_count: Stored as ``downloaded_file_count``.
        raw_record_count: Stored as ``raw_record_count``.
        error_message: Stored as ``error_message``; may be ``None``.
        run_summary: Dictionary stored as JSON in ``run_summary_json``.
        finished_at: Stored as ``finished_at``.
    """
    conn = psycopg2.connect(dsn)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                UPDATE raw.pnp_runs
                SET
                    status = %s,
                    catalog_entry_count = %s,
                    selected_download_count = %s,
                    downloaded_file_count = %s,
                    raw_record_count = %s,
                    error_message = %s,
                    run_summary_json = %s,
                    finished_at = %s
                WHERE run_id = %s
                """,
                (
                    status,
                    catalog_entry_count,
                    selected_download_count,
                    downloaded_file_count,
                    raw_record_count,
                    error_message,
                    Json(run_summary),
                    finished_at,
                    run_id,
                ),
            )
    finally:
        conn.close()
236
+
237
+
238
def mark_run_downloads_failed(
    dsn: str,
    *,
    run_id: str,
    error_message: str,
) -> int:
    """Flag every still-running download of a run as failed.

    Rows of ``raw.pnp_downloads`` with the given ``run_id`` and status
    ``'running'`` get status ``'failed'``, the supplied error message and
    ``finished_at = NOW()``.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        run_id: Identifier of the run whose downloads are updated.
        error_message: Message stored on each updated row.

    Returns:
        The number of rows updated, never less than zero.
    """
    conn = psycopg2.connect(dsn)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                UPDATE raw.pnp_downloads
                SET
                    status = 'failed',
                    error_message = %s,
                    finished_at = NOW()
                WHERE run_id = %s
                  AND status = 'running'
                """,
                (error_message, run_id),
            )
            # Clamp so a negative driver row count is reported as zero.
            return max(cur.rowcount, 0)
    finally:
        conn.close()
258
+
259
+
260
def load_instance_runtime_config(dsn: str, *, instance_key: str) -> dict[str, Any]:
    """Load the runtime configuration of one PNP instance.

    Three queries are executed: the non-deleted ``raw.pnp_instances`` row,
    the active ``raw.pnp_instance_selection`` rows, and the active
    ``raw.pnp_pipeline_endpoints`` rows joined to active
    ``raw.pnp_endpoint_tables`` rows. The results are combined into one
    dictionary.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        instance_key: Key of the instance to load.

    Returns:
        A dictionary with the instance attributes, the derived
        ``selected_years``, ``selected_microdados_types``,
        ``selected_endpoints``, ``endpoint_tables`` and ``selected_downloads``
        lists, the raw ``selection_rows``, and a ``request_params``
        sub-dictionary repeating the identifiers and derived lists with
        ``mode`` set to ``"powerbi_microdados"``.

    Raises:
        LookupError: If no non-deleted instance has ``instance_key``.
    """
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                SELECT
                    pipeline_id,
                    instance_key,
                    instance_name,
                    connection_key,
                    connection_name,
                    page_url,
                    schedule,
                    is_active,
                    metadata
                FROM raw.pnp_instances
                WHERE instance_key = %s
                  AND deleted_at IS NULL
                """,
                (instance_key,),
            )
            instance_row = cur.fetchone()
            if not instance_row:
                raise LookupError(instance_key)

            cur.execute(
                """
                SELECT
                    ano_base,
                    tipo_microdados,
                    configured_microdados_url,
                    selection_rank,
                    metadata
                FROM raw.pnp_instance_selection
                WHERE instance_key = %s
                  AND is_active = TRUE
                ORDER BY COALESCE(selection_rank, 2147483647), ano_base DESC, tipo_microdados
                """,
                (instance_key,),
            )
            selection_rows = [dict(row) for row in cur.fetchall()]

            cur.execute(
                """
                SELECT
                    pe.endpoint_key,
                    pe.selection_source,
                    pe.metadata AS pipeline_endpoint_metadata,
                    et.endpoint_name,
                    et.tipo_microdados,
                    et.raw_table_schema,
                    et.raw_table_name,
                    et.staging_table_schema,
                    et.staging_table_name,
                    et.curated_relation_schema,
                    et.curated_relation_name,
                    et.metadata AS endpoint_metadata
                FROM raw.pnp_pipeline_endpoints pe
                JOIN raw.pnp_endpoint_tables et
                  ON et.endpoint_key = pe.endpoint_key
                WHERE pe.instance_key = %s
                  AND pe.is_active = TRUE
                  AND et.is_active = TRUE
                ORDER BY et.endpoint_name, pe.endpoint_key
                """,
                (instance_key,),
            )
            endpoint_rows = [dict(row) for row in cur.fetchall()]
    finally:
        conn.close()

    # Everything below is pure Python over the fetched rows, so it runs
    # after the transaction has ended and the connection is released.

    # Only selections with a configured URL become downloads.
    selected_downloads = [
        {
            "ano_base": str(row["ano_base"]),
            "tipo_microdados": str(row["tipo_microdados"]),
            "microdados_url": str(row["configured_microdados_url"]),
        }
        for row in selection_rows
        if row.get("configured_microdados_url")
    ]
    # ``dict.fromkeys`` de-duplicates while keeping the query's ordering.
    selected_years = list(dict.fromkeys(str(row["ano_base"]) for row in selection_rows))
    selected_microdados_types = list(dict.fromkeys(str(row["tipo_microdados"]) for row in selection_rows))
    selected_endpoints = [str(row["endpoint_key"]) for row in endpoint_rows]
    endpoint_tables = [
        {
            "endpoint_key": str(row["endpoint_key"]),
            "endpoint_name": str(row["endpoint_name"]),
            "tipo_microdados": str(row["tipo_microdados"]),
            "selection_source": row.get("selection_source"),
            "raw_table": f"{row['raw_table_schema']}.{row['raw_table_name']}",
            "staging_table": (
                f"{row['staging_table_schema']}.{row['staging_table_name']}"
                if row.get("staging_table_name")
                else None
            ),
            "curated_relation": (
                f"{row['curated_relation_schema']}.{row['curated_relation_name']}"
                if row.get("curated_relation_name")
                else None
            ),
            # Pipeline-level metadata overrides endpoint-level metadata.
            "metadata": {
                **dict(row.get("endpoint_metadata") or {}),
                **dict(row.get("pipeline_endpoint_metadata") or {}),
            },
        }
        for row in endpoint_rows
    ]
    return {
        "pipeline_id": str(instance_row["pipeline_id"]),
        "instance_key": str(instance_row["instance_key"]),
        "instance_name": str(instance_row["instance_name"]),
        "connection_key": instance_row.get("connection_key"),
        "connection_name": instance_row.get("connection_name"),
        "page_url": str(instance_row["page_url"]),
        "schedule": instance_row.get("schedule"),
        "is_active": bool(instance_row.get("is_active")),
        "metadata": dict(instance_row.get("metadata") or {}),
        "selected_years": selected_years,
        "selected_microdados_types": selected_microdados_types,
        "selected_endpoints": selected_endpoints,
        "endpoint_tables": endpoint_tables,
        "selected_downloads": selected_downloads,
        "selection_rows": selection_rows,
        "request_params": {
            "mode": "powerbi_microdados",
            "pipeline_id": str(instance_row["pipeline_id"]),
            "instance_key": str(instance_row["instance_key"]),
            "instance_name": str(instance_row["instance_name"]),
            "selected_years": selected_years,
            "selected_microdados_types": selected_microdados_types,
            "selected_endpoints": selected_endpoints,
            "endpoint_tables": endpoint_tables,
            "selected_downloads": selected_downloads,
        },
    }
392
+
393
+
394
def list_active_instance_schedules(dsn: str) -> list[dict[str, Any]]:
    """List scheduling information for every active, non-deleted instance.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.

    Returns:
        One dictionary per instance, ordered by ``instance_key``, with the
        keys ``instance_key``, ``schedule``, ``updated_at``,
        ``last_started_at`` (latest ``started_at`` among the instance's rows
        in ``raw.pnp_runs``) and ``has_running_run`` (whether any of those
        rows has status ``'running'``).
    """
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            cur.execute(
                """
                SELECT
                    i.instance_key,
                    i.schedule,
                    i.updated_at,
                    (
                        SELECT MAX(r.started_at)
                        FROM raw.pnp_runs r
                        WHERE r.instance_key = i.instance_key
                    ) AS last_started_at,
                    EXISTS (
                        SELECT 1
                        FROM raw.pnp_runs r
                        WHERE r.instance_key = i.instance_key
                          AND r.status = 'running'
                    ) AS has_running_run
                FROM raw.pnp_instances i
                WHERE i.is_active = TRUE
                  AND i.deleted_at IS NULL
                ORDER BY i.instance_key
                """,
            )
            return [dict(row) for row in cur.fetchall()]
    finally:
        conn.close()
420
+
421
+
422
def load_raw_batch(
    dsn: str,
    *,
    normalized_records: list[dict[str, Any]],
    pending_assets: list[dict[str, Any]],
    pending_catalog_entries: list[dict[str, Any]],
    pending_run_selection: list[dict[str, Any]],
    pending_downloads: list[dict[str, Any]],
    pending_quarantine: list[dict[str, Any]],
    write_legacy: bool = False,
) -> dict[str, int]:
    """Persist a full raw batch (metadata, records, quarantine) in one transaction.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        normalized_records: Records passed to ``_insert_domain_records``.
        pending_assets: Only counted; reported as ``asset_count``.
        pending_catalog_entries: Passed to ``_insert_catalog_entries``.
        pending_run_selection: Passed to ``_upsert_run_selection``.
        pending_downloads: Passed to ``_upsert_downloads`` and
            ``_upsert_download_columns``.
        pending_quarantine: Passed to ``_insert_quarantine``.
        write_legacy: No longer supported; must be left ``False``.

    Returns:
        Counters for the batch: the sizes of the input lists, the download
        column and quarantine counts reported by the helpers,
        ``raw_record_count`` as the sum of the per-domain counts, and one
        ``"<domain_key>_count"`` entry per domain.

    Raises:
        RuntimeError: If ``write_legacy`` is true.
    """
    # Reject the removed option up front so an unsupported call never opens
    # a connection or runs any of the write helpers.
    if write_legacy:
        raise RuntimeError("legacy PNP compatibility support has been removed")

    asset_count = len(pending_assets)
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            _insert_catalog_entries(cur, pending_catalog_entries)
            run_selection_map = _upsert_run_selection(cur, pending_run_selection)
            download_id_by_url = _upsert_downloads(cur, pending_downloads, run_selection_map)
            download_column_count = _upsert_download_columns(cur, pending_downloads, download_id_by_url)
            domain_counts = _insert_domain_records(cur, normalized_records, download_id_by_url)
            quarantine_count = _insert_quarantine(cur, pending_quarantine, download_id_by_url)
    finally:
        conn.close()

    return {
        "catalog_entry_count": len(pending_catalog_entries),
        "selected_download_count": len(pending_run_selection),
        "downloaded_file_count": len(pending_downloads),
        "download_column_count": download_column_count,
        "asset_count": asset_count,
        "raw_record_count": sum(domain_counts.values()),
        "quarantine_count": quarantine_count,
        **{f"{domain_key}_count": count for domain_key, count in domain_counts.items()},
    }
455
+
456
+
457
def upsert_raw_metadata(
    dsn: str,
    *,
    pending_assets: list[dict[str, Any]],
    pending_catalog_entries: list[dict[str, Any]],
    pending_run_selection: list[dict[str, Any]],
    pending_downloads: list[dict[str, Any]],
    write_legacy: bool = False,
    include_download_columns: bool = True,
) -> dict[str, Any]:
    """Persist only the metadata of a batch (catalog, selection, downloads).

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        pending_assets: Only counted; reported as ``asset_count``.
        pending_catalog_entries: Passed to ``_insert_catalog_entries``.
        pending_run_selection: Passed to ``_upsert_run_selection``.
        pending_downloads: Passed to ``_upsert_downloads`` and, optionally,
            ``_upsert_download_columns``.
        write_legacy: No longer supported; must be left ``False``.
        include_download_columns: When false, download columns are not
            written and ``download_column_count`` is reported as ``0``.

    Returns:
        The metadata counters plus ``download_id_by_url``, the mapping
        returned by ``_upsert_downloads``.

    Raises:
        RuntimeError: If ``write_legacy`` is true.
    """
    # Reject the removed option up front so an unsupported call never opens
    # a connection or runs any of the write helpers.
    if write_legacy:
        raise RuntimeError("legacy PNP compatibility support has been removed")

    asset_count = len(pending_assets)
    conn = psycopg2.connect(dsn, cursor_factory=RealDictCursor)
    try:
        # ``with conn`` only ends the transaction; the connection itself is
        # closed explicitly in ``finally``.
        with conn, conn.cursor() as cur:
            _insert_catalog_entries(cur, pending_catalog_entries)
            run_selection_map = _upsert_run_selection(cur, pending_run_selection)
            download_id_by_url = _upsert_downloads(cur, pending_downloads, run_selection_map)
            download_column_count = 0
            if include_download_columns:
                download_column_count = _upsert_download_columns(cur, pending_downloads, download_id_by_url)
    finally:
        conn.close()

    return {
        "catalog_entry_count": len(pending_catalog_entries),
        "selected_download_count": len(pending_run_selection),
        "downloaded_file_count": len(pending_downloads),
        "download_column_count": download_column_count,
        "asset_count": asset_count,
        "download_id_by_url": download_id_by_url,
    }
487
+
488
+
489
+ def load_raw_record_chunk(
490
+ dsn: str,
491
+ *,
492
+ normalized_records: list[dict[str, Any]],
493
+ pending_quarantine: list[dict[str, Any]],
494
+ download_id_by_url: dict[str, int],
495
+ pending_assets: list[dict[str, Any]] | None = None,
496
+ write_legacy: bool = False,
497
+ ) -> dict[str, int]:
498
+ assets = pending_assets or []
499
+ asset_count = len(assets)
500
+
501
+ with psycopg2.connect(dsn, cursor_factory=RealDictCursor) as conn, conn.cursor() as cur:
502
+ domain_counts = _insert_domain_records(cur, normalized_records, download_id_by_url)
503
+ quarantine_count = _insert_quarantine(cur, pending_quarantine, download_id_by_url)
504
+
505
+ if write_legacy:
506
+ raise RuntimeError("legacy PNP compatibility support has been removed")
507
+
508
+ return {
509
+ "raw_record_count": sum(domain_counts.values()),
510
+ "quarantine_count": quarantine_count,
511
+ "asset_count": asset_count,
512
+ **{f"{domain_key}_count": count for domain_key, count in domain_counts.items()},
513
+ }
514
+
515
+
516
def collect_run_checks(dsn: str, run_id: str) -> dict[str, Any]:
    """Collect per-run row counts and staging status for one ingestion run.

    Executes one ``COUNT(*)`` query per raw, staging and curated relation
    listed in ``query_map`` (each filtered by ``run_id``), then reads the
    matching row, if any, from ``staging.pnp_ingestion_runs``.

    Args:
        dsn: Connection string handed to ``psycopg2.connect``.
        run_id: Identifier of the run to inspect.

    Returns:
        A dict holding ``run_id``, one integer per key of ``query_map``,
        ``raw_count`` (sum of the four raw domain counts) and, only when a
        staging run row exists, ``staging_status``,
        ``staging_deduplicated_record_count``, ``staging_quality_status``
        and ``staging_quality_summary``.
    """
    query_map = {
        "catalog_entry_count": "SELECT COUNT(*) FROM raw.pnp_catalog_entries WHERE run_id = %s",
        "run_selection_count": "SELECT COUNT(*) FROM raw.pnp_run_selection WHERE run_id = %s",
        "download_count": "SELECT COUNT(*) FROM raw.pnp_downloads WHERE run_id = %s",
        "download_column_count": """
            SELECT COUNT(*)
            FROM raw.pnp_download_columns c
            JOIN raw.pnp_downloads d ON d.download_id = c.download_id
            WHERE d.run_id = %s
        """,
        "matriculas_count": "SELECT COUNT(*) FROM raw.pnp_matriculas_src WHERE run_id = %s",
        "eficiencia_academica_count": "SELECT COUNT(*) FROM raw.pnp_eficiencia_academica_src WHERE run_id = %s",
        "servidores_count": "SELECT COUNT(*) FROM raw.pnp_servidores_src WHERE run_id = %s",
        "financeiro_count": "SELECT COUNT(*) FROM raw.pnp_financeiro_src WHERE run_id = %s",
        "quarantine_count": "SELECT COUNT(*) FROM raw.pnp_ingestion_quarantine WHERE run_id = %s",
        "run_package_count": "SELECT COUNT(*) FROM raw.pnp_run_packages WHERE run_id = %s",
        "staging_matriculas_count": "SELECT COUNT(*) FROM staging.pnp_matriculas WHERE run_id = %s",
        "staging_eficiencia_academica_count": "SELECT COUNT(*) FROM staging.pnp_eficiencia_academica WHERE run_id = %s",
        "staging_servidores_count": "SELECT COUNT(*) FROM staging.pnp_servidores WHERE run_id = %s",
        "staging_financeiro_count": "SELECT COUNT(*) FROM staging.pnp_financeiro WHERE run_id = %s",
        "curated_admin_ingestao_count": "SELECT COUNT(*) FROM curated.vw_pnp_admin_ingestao WHERE run_id = %s",
        "curated_qualidade_count": "SELECT COUNT(*) FROM curated.vw_pnp_qualidade_dados WHERE run_id = %s",
        "curated_matriculas_perfil_count": "SELECT COUNT(*) FROM curated.vw_pnp_matriculas_perfil WHERE run_id = %s",
        "curated_matriculas_oferta_count": "SELECT COUNT(*) FROM curated.vw_pnp_matriculas_oferta WHERE run_id = %s",
        "curated_eficiencia_situacao_count": "SELECT COUNT(*) FROM curated.vw_pnp_eficiencia_situacao WHERE run_id = %s",
        "curated_servidores_quadro_count": "SELECT COUNT(*) FROM curated.vw_pnp_servidores_quadro WHERE run_id = %s",
        "curated_financeiro_execucao_count": "SELECT COUNT(*) FROM curated.vw_pnp_financeiro_execucao WHERE run_id = %s",
        "curated_vanna_resumo_count": "SELECT COUNT(*) FROM curated.vw_pnp_vanna_resumo WHERE run_id = %s",
    }

    result: dict[str, Any] = {"run_id": run_id}

    # A psycopg2 connection used as a context manager only ends the
    # transaction on exit; it does not close the connection. Close it
    # explicitly so repeated calls do not rely on garbage collection.
    conn = psycopg2.connect(dsn)
    try:
        with conn, conn.cursor() as cur:
            for key, query in query_map.items():
                cur.execute(query, (run_id,))
                result[key] = int(cur.fetchone()[0])

            cur.execute(
                """
                SELECT
                    status,
                    deduplicated_record_count,
                    quality_status,
                    quality_summary_json
                FROM staging.pnp_ingestion_runs
                WHERE run_id = %s
                """,
                (run_id,),
            )
            staging_row = cur.fetchone()
            if staging_row:
                result["staging_status"] = staging_row[0]
                # A NULL deduplicated count is reported as 0.
                result["staging_deduplicated_record_count"] = int(staging_row[1] or 0)
                result["staging_quality_status"] = staging_row[2]
                result["staging_quality_summary"] = staging_row[3]
    finally:
        conn.close()

    result["raw_count"] = (
        result["matriculas_count"]
        + result["eficiencia_academica_count"]
        + result["servidores_count"]
        + result["financeiro_count"]
    )
    return result
579
+
580
+
581
+ def _insert_catalog_entries(cur, rows: list[dict[str, Any]]) -> None:
582
+ if not rows:
583
+ return
584
+
585
+ execute_values(
586
+ cur,
587
+ """
588
+ INSERT INTO raw.pnp_catalog_entries (
589
+ run_id,
590
+ instance_key,
591
+ ano_base,
592
+ tipo_microdados,
593
+ microdados_url,
594
+ resource_key,
595
+ visual_id,
596
+ api_base_url,
597
+ catalog_hash,
598
+ is_selected
599
+ ) VALUES %s
600
+ ON CONFLICT (run_id, ano_base, tipo_microdados, microdados_url) DO UPDATE
601
+ SET
602
+ resource_key = EXCLUDED.resource_key,
603
+ visual_id = EXCLUDED.visual_id,
604
+ api_base_url = EXCLUDED.api_base_url,
605
+ catalog_hash = EXCLUDED.catalog_hash,
606
+ is_selected = EXCLUDED.is_selected,
607
+ captured_at = NOW()
608
+ """,
609
+ [
610
+ (
611
+ row["run_id"],
612
+ row.get("instance_key"),
613
+ row["ano_base"],
614
+ row["tipo_microdados"],
615
+ row["microdados_url"],
616
+ row.get("resource_key"),
617
+ row.get("visual_id"),
618
+ row.get("api_base_url"),
619
+ row.get("catalog_hash"),
620
+ bool(row.get("is_selected")),
621
+ )
622
+ for row in rows
623
+ ],
624
+ page_size=500,
625
+ )
626
+
627
+
628
+ def _upsert_run_selection(cur, rows: list[dict[str, Any]]) -> dict[str, int]:
629
+ if not rows:
630
+ return {}
631
+
632
+ execute_values(
633
+ cur,
634
+ """
635
+ INSERT INTO raw.pnp_run_selection (
636
+ run_id,
637
+ instance_key,
638
+ ano_base,
639
+ tipo_microdados,
640
+ microdados_url,
641
+ selection_source,
642
+ selection_rank,
643
+ details_json
644
+ ) VALUES %s
645
+ ON CONFLICT (run_id, ano_base, tipo_microdados, microdados_url) DO UPDATE
646
+ SET
647
+ selection_source = EXCLUDED.selection_source,
648
+ selection_rank = EXCLUDED.selection_rank,
649
+ details_json = EXCLUDED.details_json,
650
+ selected_at = NOW()
651
+ RETURNING run_selection_id, microdados_url
652
+ """,
653
+ [
654
+ (
655
+ row["run_id"],
656
+ row.get("instance_key"),
657
+ row["ano_base"],
658
+ row["tipo_microdados"],
659
+ row["microdados_url"],
660
+ row.get("selection_source"),
661
+ row.get("selection_rank"),
662
+ Json(row.get("details_json") or {}),
663
+ )
664
+ for row in rows
665
+ ],
666
+ page_size=500,
667
+ )
668
+ return {str(row["microdados_url"]): int(row["run_selection_id"]) for row in cur.fetchall()}
669
+
670
+
671
+ def _upsert_downloads(cur, rows: list[dict[str, Any]], run_selection_map: dict[str, int]) -> dict[str, int]:
672
+ if not rows:
673
+ return {}
674
+
675
+ execute_values(
676
+ cur,
677
+ """
678
+ INSERT INTO raw.pnp_downloads (
679
+ run_id,
680
+ instance_key,
681
+ run_selection_id,
682
+ ano_base,
683
+ tipo_microdados,
684
+ microdados_url,
685
+ source_file_name,
686
+ source_file_sha256,
687
+ content_type,
688
+ size_bytes,
689
+ row_count_raw,
690
+ status,
691
+ error_message,
692
+ details_json
693
+ ) VALUES %s
694
+ ON CONFLICT (run_id, microdados_url) DO UPDATE
695
+ SET
696
+ run_selection_id = EXCLUDED.run_selection_id,
697
+ source_file_name = EXCLUDED.source_file_name,
698
+ source_file_sha256 = EXCLUDED.source_file_sha256,
699
+ content_type = EXCLUDED.content_type,
700
+ size_bytes = EXCLUDED.size_bytes,
701
+ row_count_raw = EXCLUDED.row_count_raw,
702
+ status = EXCLUDED.status,
703
+ error_message = EXCLUDED.error_message,
704
+ details_json = EXCLUDED.details_json,
705
+ finished_at = NOW()
706
+ RETURNING download_id, microdados_url
707
+ """,
708
+ [
709
+ (
710
+ row["run_id"],
711
+ row.get("instance_key"),
712
+ run_selection_map.get(row["microdados_url"]),
713
+ row["ano_base"],
714
+ row["tipo_microdados"],
715
+ row["microdados_url"],
716
+ row.get("source_file_name"),
717
+ row.get("source_file_sha256"),
718
+ row.get("content_type"),
719
+ row.get("size_bytes"),
720
+ row.get("row_count_raw"),
721
+ row.get("status") or "success",
722
+ row.get("error_message"),
723
+ Json(row.get("details_json") or {}),
724
+ )
725
+ for row in rows
726
+ ],
727
+ page_size=200,
728
+ )
729
+ return {str(row["microdados_url"]): int(row["download_id"]) for row in cur.fetchall()}
730
+
731
+
732
+ def _upsert_download_columns(cur, rows: list[dict[str, Any]], download_id_by_url: dict[str, int]) -> int:
733
+ values: list[tuple[Any, ...]] = []
734
+ for row in rows:
735
+ download_id = download_id_by_url.get(str(row["microdados_url"]))
736
+ if not download_id:
737
+ continue
738
+ for position, column_name in enumerate(row.get("headers") or (), start=1):
739
+ values.append(
740
+ (
741
+ download_id,
742
+ position,
743
+ column_name,
744
+ row.get("normalized_headers", {}).get(column_name) or column_name,
745
+ )
746
+ )
747
+
748
+ if not values:
749
+ return 0
750
+
751
+ execute_values(
752
+ cur,
753
+ """
754
+ INSERT INTO raw.pnp_download_columns (
755
+ download_id,
756
+ column_position,
757
+ column_name,
758
+ normalized_column_name
759
+ ) VALUES %s
760
+ ON CONFLICT (download_id, column_position) DO UPDATE
761
+ SET
762
+ column_name = EXCLUDED.column_name,
763
+ normalized_column_name = EXCLUDED.normalized_column_name,
764
+ captured_at = NOW()
765
+ """,
766
+ values,
767
+ page_size=500,
768
+ )
769
+ return len(values)
770
+
771
+
772
+ def _insert_domain_records(cur, rows: list[dict[str, Any]], download_id_by_url: dict[str, int]) -> dict[str, int]:
773
+ grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
774
+ for row in rows:
775
+ grouped[str(row["raw_table_name"])].append(row)
776
+
777
+ counts: dict[str, int] = {}
778
+ for table_name, table_rows in grouped.items():
779
+ data_columns = list((table_rows[0].get("field_values") or {}).keys())
780
+ insert_columns = [
781
+ "run_id",
782
+ "instance_key",
783
+ "download_id",
784
+ "record_hash",
785
+ "source_record_id",
786
+ "source_row_number",
787
+ "source_file_name",
788
+ "source_file_sha256",
789
+ "source_url",
790
+ "ano_base",
791
+ "tipo_microdados",
792
+ *data_columns,
793
+ ]
794
+ values = [
795
+ (
796
+ row["run_id"],
797
+ row.get("instance_key"),
798
+ download_id_by_url.get(str(row["source_url"])),
799
+ row["record_hash"],
800
+ row.get("source_record_id"),
801
+ row.get("source_row_number"),
802
+ row.get("source_file_name"),
803
+ row.get("source_file_sha256"),
804
+ row["source_url"],
805
+ row.get("ano_base"),
806
+ row["tipo_microdados"],
807
+ *[row.get("field_values", {}).get(column_name) for column_name in data_columns],
808
+ )
809
+ for row in table_rows
810
+ ]
811
+
812
+ statement = sql.SQL(
813
+ """
814
+ INSERT INTO {} ({}) VALUES %s
815
+ ON CONFLICT (run_id, download_id, source_row_number) DO NOTHING
816
+ """
817
+ ).format(
818
+ sql.Identifier("raw", table_name),
819
+ sql.SQL(", ").join(sql.Identifier(column_name) for column_name in insert_columns),
820
+ )
821
+ execute_values(cur, statement.as_string(cur), values, page_size=1000)
822
+ counts[table_name.replace("pnp_", "").replace("_src", "")] = len(values)
823
+
824
+ return counts
825
+
826
+
827
+ def _insert_quarantine(cur, rows: list[dict[str, Any]], download_id_by_url: dict[str, int]) -> int:
828
+ if not rows:
829
+ return 0
830
+
831
+ execute_values(
832
+ cur,
833
+ """
834
+ INSERT INTO raw.pnp_ingestion_quarantine (
835
+ run_id,
836
+ instance_key,
837
+ download_id,
838
+ source_row_number,
839
+ error_type,
840
+ error_message,
841
+ raw_line_text,
842
+ details_json
843
+ ) VALUES %s
844
+ """,
845
+ [
846
+ (
847
+ row["run_id"],
848
+ row.get("instance_key"),
849
+ download_id_by_url.get(str(row.get("source_url") or "")),
850
+ row.get("source_row_number"),
851
+ row.get("error_type"),
852
+ row.get("error_message"),
853
+ row.get("raw_line_text"),
854
+ Json(row.get("details_json") or {}),
855
+ )
856
+ for row in rows
857
+ ],
858
+ page_size=200,
859
+ )
860
+ return len(rows)