paasta_tools-1.21.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (348)
  1. k8s_itests/__init__.py +0 -0
  2. k8s_itests/test_autoscaling.py +23 -0
  3. k8s_itests/utils.py +38 -0
  4. paasta_tools/__init__.py +20 -0
  5. paasta_tools/adhoc_tools.py +142 -0
  6. paasta_tools/api/__init__.py +13 -0
  7. paasta_tools/api/api.py +330 -0
  8. paasta_tools/api/api_docs/swagger.json +2323 -0
  9. paasta_tools/api/client.py +106 -0
  10. paasta_tools/api/settings.py +33 -0
  11. paasta_tools/api/tweens/__init__.py +6 -0
  12. paasta_tools/api/tweens/auth.py +125 -0
  13. paasta_tools/api/tweens/profiling.py +108 -0
  14. paasta_tools/api/tweens/request_logger.py +124 -0
  15. paasta_tools/api/views/__init__.py +13 -0
  16. paasta_tools/api/views/autoscaler.py +100 -0
  17. paasta_tools/api/views/exception.py +45 -0
  18. paasta_tools/api/views/flink.py +73 -0
  19. paasta_tools/api/views/instance.py +395 -0
  20. paasta_tools/api/views/pause_autoscaler.py +71 -0
  21. paasta_tools/api/views/remote_run.py +113 -0
  22. paasta_tools/api/views/resources.py +76 -0
  23. paasta_tools/api/views/service.py +35 -0
  24. paasta_tools/api/views/version.py +25 -0
  25. paasta_tools/apply_external_resources.py +79 -0
  26. paasta_tools/async_utils.py +109 -0
  27. paasta_tools/autoscaling/__init__.py +0 -0
  28. paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
  29. paasta_tools/autoscaling/forecasting.py +106 -0
  30. paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
  31. paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
  32. paasta_tools/autoscaling/utils.py +52 -0
  33. paasta_tools/bounce_lib.py +184 -0
  34. paasta_tools/broadcast_log_to_services.py +62 -0
  35. paasta_tools/cassandracluster_tools.py +210 -0
  36. paasta_tools/check_autoscaler_max_instances.py +212 -0
  37. paasta_tools/check_cassandracluster_services_replication.py +35 -0
  38. paasta_tools/check_flink_services_health.py +203 -0
  39. paasta_tools/check_kubernetes_api.py +57 -0
  40. paasta_tools/check_kubernetes_services_replication.py +141 -0
  41. paasta_tools/check_oom_events.py +244 -0
  42. paasta_tools/check_services_replication_tools.py +324 -0
  43. paasta_tools/check_spark_jobs.py +234 -0
  44. paasta_tools/cleanup_kubernetes_cr.py +138 -0
  45. paasta_tools/cleanup_kubernetes_crd.py +145 -0
  46. paasta_tools/cleanup_kubernetes_jobs.py +344 -0
  47. paasta_tools/cleanup_tron_namespaces.py +96 -0
  48. paasta_tools/cli/__init__.py +13 -0
  49. paasta_tools/cli/authentication.py +85 -0
  50. paasta_tools/cli/cli.py +260 -0
  51. paasta_tools/cli/cmds/__init__.py +13 -0
  52. paasta_tools/cli/cmds/autoscale.py +143 -0
  53. paasta_tools/cli/cmds/check.py +334 -0
  54. paasta_tools/cli/cmds/cook_image.py +147 -0
  55. paasta_tools/cli/cmds/get_docker_image.py +76 -0
  56. paasta_tools/cli/cmds/get_image_version.py +172 -0
  57. paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
  58. paasta_tools/cli/cmds/info.py +155 -0
  59. paasta_tools/cli/cmds/itest.py +117 -0
  60. paasta_tools/cli/cmds/list.py +66 -0
  61. paasta_tools/cli/cmds/list_clusters.py +42 -0
  62. paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
  63. paasta_tools/cli/cmds/list_namespaces.py +84 -0
  64. paasta_tools/cli/cmds/local_run.py +1396 -0
  65. paasta_tools/cli/cmds/logs.py +1601 -0
  66. paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
  67. paasta_tools/cli/cmds/mesh_status.py +174 -0
  68. paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
  69. paasta_tools/cli/cmds/push_to_registry.py +275 -0
  70. paasta_tools/cli/cmds/remote_run.py +252 -0
  71. paasta_tools/cli/cmds/rollback.py +347 -0
  72. paasta_tools/cli/cmds/secret.py +549 -0
  73. paasta_tools/cli/cmds/security_check.py +59 -0
  74. paasta_tools/cli/cmds/spark_run.py +1400 -0
  75. paasta_tools/cli/cmds/start_stop_restart.py +401 -0
  76. paasta_tools/cli/cmds/status.py +2302 -0
  77. paasta_tools/cli/cmds/validate.py +1012 -0
  78. paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
  79. paasta_tools/cli/fsm/__init__.py +13 -0
  80. paasta_tools/cli/fsm/autosuggest.py +82 -0
  81. paasta_tools/cli/fsm/template/README.md +8 -0
  82. paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
  83. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
  84. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
  85. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
  86. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
  87. paasta_tools/cli/fsm_cmd.py +121 -0
  88. paasta_tools/cli/paasta_tabcomplete.sh +23 -0
  89. paasta_tools/cli/schemas/adhoc_schema.json +199 -0
  90. paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
  91. paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
  92. paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
  93. paasta_tools/cli/schemas/deploy_schema.json +173 -0
  94. paasta_tools/cli/schemas/eks_schema.json +970 -0
  95. paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
  96. paasta_tools/cli/schemas/rollback_schema.json +160 -0
  97. paasta_tools/cli/schemas/service_schema.json +25 -0
  98. paasta_tools/cli/schemas/smartstack_schema.json +322 -0
  99. paasta_tools/cli/schemas/tron_schema.json +699 -0
  100. paasta_tools/cli/utils.py +1118 -0
  101. paasta_tools/clusterman.py +21 -0
  102. paasta_tools/config_utils.py +385 -0
  103. paasta_tools/contrib/__init__.py +0 -0
  104. paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
  105. paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
  106. paasta_tools/contrib/check_orphans.py +306 -0
  107. paasta_tools/contrib/create_dynamodb_table.py +35 -0
  108. paasta_tools/contrib/create_paasta_playground.py +105 -0
  109. paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
  110. paasta_tools/contrib/get_running_task_allocation.py +346 -0
  111. paasta_tools/contrib/habitat_fixer.py +86 -0
  112. paasta_tools/contrib/ide_helper.py +316 -0
  113. paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
  114. paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
  115. paasta_tools/contrib/kill_bad_containers.py +109 -0
  116. paasta_tools/contrib/mass-deploy-tag.sh +44 -0
  117. paasta_tools/contrib/mock_patch_checker.py +86 -0
  118. paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
  119. paasta_tools/contrib/render_template.py +129 -0
  120. paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
  121. paasta_tools/contrib/service_shard_remove.py +157 -0
  122. paasta_tools/contrib/service_shard_update.py +373 -0
  123. paasta_tools/contrib/shared_ip_check.py +77 -0
  124. paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
  125. paasta_tools/delete_kubernetes_deployments.py +89 -0
  126. paasta_tools/deployment_utils.py +44 -0
  127. paasta_tools/docker_wrapper.py +234 -0
  128. paasta_tools/docker_wrapper_imports.py +13 -0
  129. paasta_tools/drain_lib.py +351 -0
  130. paasta_tools/dump_locally_running_services.py +71 -0
  131. paasta_tools/eks_tools.py +119 -0
  132. paasta_tools/envoy_tools.py +373 -0
  133. paasta_tools/firewall.py +504 -0
  134. paasta_tools/firewall_logging.py +154 -0
  135. paasta_tools/firewall_update.py +172 -0
  136. paasta_tools/flink_tools.py +345 -0
  137. paasta_tools/flinkeks_tools.py +90 -0
  138. paasta_tools/frameworks/__init__.py +0 -0
  139. paasta_tools/frameworks/adhoc_scheduler.py +71 -0
  140. paasta_tools/frameworks/constraints.py +87 -0
  141. paasta_tools/frameworks/native_scheduler.py +652 -0
  142. paasta_tools/frameworks/native_service_config.py +301 -0
  143. paasta_tools/frameworks/task_store.py +245 -0
  144. paasta_tools/generate_all_deployments +9 -0
  145. paasta_tools/generate_authenticating_services.py +94 -0
  146. paasta_tools/generate_deployments_for_service.py +255 -0
  147. paasta_tools/generate_services_file.py +114 -0
  148. paasta_tools/generate_services_yaml.py +30 -0
  149. paasta_tools/hacheck.py +76 -0
  150. paasta_tools/instance/__init__.py +0 -0
  151. paasta_tools/instance/hpa_metrics_parser.py +122 -0
  152. paasta_tools/instance/kubernetes.py +1362 -0
  153. paasta_tools/iptables.py +240 -0
  154. paasta_tools/kafkacluster_tools.py +143 -0
  155. paasta_tools/kubernetes/__init__.py +0 -0
  156. paasta_tools/kubernetes/application/__init__.py +0 -0
  157. paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
  158. paasta_tools/kubernetes/application/tools.py +90 -0
  159. paasta_tools/kubernetes/bin/__init__.py +0 -0
  160. paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
  161. paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
  162. paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
  163. paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
  164. paasta_tools/kubernetes/remote_run.py +558 -0
  165. paasta_tools/kubernetes_tools.py +4679 -0
  166. paasta_tools/list_kubernetes_service_instances.py +128 -0
  167. paasta_tools/list_tron_namespaces.py +60 -0
  168. paasta_tools/long_running_service_tools.py +678 -0
  169. paasta_tools/mac_address.py +44 -0
  170. paasta_tools/marathon_dashboard.py +0 -0
  171. paasta_tools/mesos/__init__.py +0 -0
  172. paasta_tools/mesos/cfg.py +46 -0
  173. paasta_tools/mesos/cluster.py +60 -0
  174. paasta_tools/mesos/exceptions.py +59 -0
  175. paasta_tools/mesos/framework.py +77 -0
  176. paasta_tools/mesos/log.py +48 -0
  177. paasta_tools/mesos/master.py +306 -0
  178. paasta_tools/mesos/mesos_file.py +169 -0
  179. paasta_tools/mesos/parallel.py +52 -0
  180. paasta_tools/mesos/slave.py +115 -0
  181. paasta_tools/mesos/task.py +94 -0
  182. paasta_tools/mesos/util.py +69 -0
  183. paasta_tools/mesos/zookeeper.py +37 -0
  184. paasta_tools/mesos_maintenance.py +848 -0
  185. paasta_tools/mesos_tools.py +1051 -0
  186. paasta_tools/metrics/__init__.py +0 -0
  187. paasta_tools/metrics/metastatus_lib.py +1110 -0
  188. paasta_tools/metrics/metrics_lib.py +217 -0
  189. paasta_tools/monitoring/__init__.py +13 -0
  190. paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
  191. paasta_tools/monitoring_tools.py +652 -0
  192. paasta_tools/monkrelaycluster_tools.py +146 -0
  193. paasta_tools/nrtsearchservice_tools.py +143 -0
  194. paasta_tools/nrtsearchserviceeks_tools.py +68 -0
  195. paasta_tools/oom_logger.py +321 -0
  196. paasta_tools/paasta_deploy_tron_jobs +3 -0
  197. paasta_tools/paasta_execute_docker_command.py +123 -0
  198. paasta_tools/paasta_native_serviceinit.py +21 -0
  199. paasta_tools/paasta_service_config_loader.py +201 -0
  200. paasta_tools/paastaapi/__init__.py +29 -0
  201. paasta_tools/paastaapi/api/__init__.py +3 -0
  202. paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
  203. paasta_tools/paastaapi/api/default_api.py +569 -0
  204. paasta_tools/paastaapi/api/remote_run_api.py +604 -0
  205. paasta_tools/paastaapi/api/resources_api.py +157 -0
  206. paasta_tools/paastaapi/api/service_api.py +1736 -0
  207. paasta_tools/paastaapi/api_client.py +818 -0
  208. paasta_tools/paastaapi/apis/__init__.py +22 -0
  209. paasta_tools/paastaapi/configuration.py +455 -0
  210. paasta_tools/paastaapi/exceptions.py +137 -0
  211. paasta_tools/paastaapi/model/__init__.py +5 -0
  212. paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
  213. paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
  214. paasta_tools/paastaapi/model/deploy_queue.py +178 -0
  215. paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
  216. paasta_tools/paastaapi/model/envoy_backend.py +185 -0
  217. paasta_tools/paastaapi/model/envoy_location.py +184 -0
  218. paasta_tools/paastaapi/model/envoy_status.py +181 -0
  219. paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
  220. paasta_tools/paastaapi/model/flink_config.py +173 -0
  221. paasta_tools/paastaapi/model/flink_job.py +186 -0
  222. paasta_tools/paastaapi/model/flink_job_details.py +192 -0
  223. paasta_tools/paastaapi/model/flink_jobs.py +175 -0
  224. paasta_tools/paastaapi/model/float_and_error.py +173 -0
  225. paasta_tools/paastaapi/model/hpa_metric.py +176 -0
  226. paasta_tools/paastaapi/model/inline_object.py +170 -0
  227. paasta_tools/paastaapi/model/inline_response200.py +170 -0
  228. paasta_tools/paastaapi/model/inline_response2001.py +170 -0
  229. paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
  230. paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
  231. paasta_tools/paastaapi/model/instance_status.py +220 -0
  232. paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
  233. paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
  234. paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
  235. paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
  236. paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
  237. paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
  238. paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
  239. paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
  240. paasta_tools/paastaapi/model/instance_tasks.py +182 -0
  241. paasta_tools/paastaapi/model/integer_and_error.py +173 -0
  242. paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
  243. paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
  244. paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
  245. paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
  246. paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
  247. paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
  248. paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
  249. paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
  250. paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
  251. paasta_tools/paastaapi/model/remote_run_start.py +185 -0
  252. paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
  253. paasta_tools/paastaapi/model/remote_run_token.py +173 -0
  254. paasta_tools/paastaapi/model/resource.py +187 -0
  255. paasta_tools/paastaapi/model/resource_item.py +187 -0
  256. paasta_tools/paastaapi/model/resource_value.py +176 -0
  257. paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
  258. paasta_tools/paastaapi/model/smartstack_location.py +181 -0
  259. paasta_tools/paastaapi/model/smartstack_status.py +181 -0
  260. paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
  261. paasta_tools/paastaapi/model_utils.py +1879 -0
  262. paasta_tools/paastaapi/models/__init__.py +62 -0
  263. paasta_tools/paastaapi/rest.py +287 -0
  264. paasta_tools/prune_completed_pods.py +220 -0
  265. paasta_tools/puppet_service_tools.py +59 -0
  266. paasta_tools/py.typed +1 -0
  267. paasta_tools/remote_git.py +127 -0
  268. paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
  269. paasta_tools/run-paasta-api-playground.py +51 -0
  270. paasta_tools/secret_providers/__init__.py +66 -0
  271. paasta_tools/secret_providers/vault.py +214 -0
  272. paasta_tools/secret_tools.py +277 -0
  273. paasta_tools/setup_istio_mesh.py +353 -0
  274. paasta_tools/setup_kubernetes_cr.py +412 -0
  275. paasta_tools/setup_kubernetes_crd.py +138 -0
  276. paasta_tools/setup_kubernetes_internal_crd.py +154 -0
  277. paasta_tools/setup_kubernetes_job.py +353 -0
  278. paasta_tools/setup_prometheus_adapter_config.py +1028 -0
  279. paasta_tools/setup_tron_namespace.py +248 -0
  280. paasta_tools/slack.py +75 -0
  281. paasta_tools/smartstack_tools.py +676 -0
  282. paasta_tools/spark_tools.py +283 -0
  283. paasta_tools/synapse_srv_namespaces_fact.py +42 -0
  284. paasta_tools/tron/__init__.py +0 -0
  285. paasta_tools/tron/client.py +158 -0
  286. paasta_tools/tron/tron_command_context.py +194 -0
  287. paasta_tools/tron/tron_timeutils.py +101 -0
  288. paasta_tools/tron_tools.py +1448 -0
  289. paasta_tools/utils.py +4307 -0
  290. paasta_tools/yaml_tools.py +44 -0
  291. paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
  292. paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
  293. paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
  294. paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
  295. paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
  296. paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
  297. paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
  298. paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
  299. paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
  300. paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
  301. paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
  302. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
  303. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
  304. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
  305. paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
  306. paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
  307. paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
  308. paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
  309. paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
  310. paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
  311. paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
  312. paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
  313. paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
  314. paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
  315. paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
  316. paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
  317. paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
  318. paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
  319. paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
  320. paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
  321. paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
  322. paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
  323. paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
  324. paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
  325. paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
  326. paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
  327. paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
  328. paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
  329. paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
  330. paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
  331. paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
  332. paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
  333. paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
  334. paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
  335. paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
  336. paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
  337. paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
  338. paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
  339. paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
  340. paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
  341. paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
  342. paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
  343. paasta_tools-1.21.3.dist-info/LICENSE +201 -0
  344. paasta_tools-1.21.3.dist-info/METADATA +74 -0
  345. paasta_tools-1.21.3.dist-info/RECORD +348 -0
  346. paasta_tools-1.21.3.dist-info/WHEEL +5 -0
  347. paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
  348. paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
paasta_tools/cli/cmds/spark_run.py
@@ -0,0 +1,1400 @@
+ import argparse
+ import json
+ import logging
+ import os
+ import re
+ import shlex
+ import socket
+ import sys
+ from configparser import ConfigParser
+ from typing import Any
+ from typing import cast
+ from typing import Dict
+ from typing import List
+ from typing import Mapping
+ from typing import Optional
+ from typing import Set
+ from typing import Tuple
+ from typing import Union
+
+ from service_configuration_lib import read_service_configuration
+ from service_configuration_lib import read_yaml_file
+ from service_configuration_lib import spark_config
+ from service_configuration_lib.spark_config import get_aws_credentials
+ from service_configuration_lib.spark_config import get_grafana_url
+ from service_configuration_lib.spark_config import get_resources_requested
+ from service_configuration_lib.spark_config import get_spark_hourly_cost
+ from service_configuration_lib.spark_config import UnsupportedClusterManagerException
+
+ from paasta_tools.cli.authentication import get_service_auth_token
+ from paasta_tools.cli.cmds.check import makefile_responds_to
+ from paasta_tools.cli.cmds.cook_image import paasta_cook_image
+ from paasta_tools.cli.utils import get_instance_config
+ from paasta_tools.cli.utils import lazy_choices_completer
+ from paasta_tools.cli.utils import list_instances
+ from paasta_tools.clusterman import get_clusterman_metrics
+ from paasta_tools.kubernetes_tools import get_service_account_name
+ from paasta_tools.spark_tools import auto_add_timeout_for_spark_job
+ from paasta_tools.spark_tools import create_spark_config_str
+ from paasta_tools.spark_tools import DEFAULT_SPARK_RUNTIME_TIMEOUT
+ from paasta_tools.spark_tools import DEFAULT_SPARK_SERVICE
+ from paasta_tools.spark_tools import get_volumes_from_spark_k8s_configs
+ from paasta_tools.spark_tools import get_webui_url
+ from paasta_tools.spark_tools import inject_spark_conf_str
+ from paasta_tools.tron_tools import load_tron_instance_configs
+ from paasta_tools.utils import _run
+ from paasta_tools.utils import DEFAULT_SOA_DIR
+ from paasta_tools.utils import filter_templates_from_config
+ from paasta_tools.utils import get_k8s_url_for_cluster
+ from paasta_tools.utils import get_possible_launched_by_user_variable_from_env
+ from paasta_tools.utils import get_username
+ from paasta_tools.utils import InstanceConfig
+ from paasta_tools.utils import is_using_unprivileged_containers
+ from paasta_tools.utils import list_services
+ from paasta_tools.utils import load_system_paasta_config
+ from paasta_tools.utils import NoConfigurationForServiceError
+ from paasta_tools.utils import NoDeploymentsAvailable
+ from paasta_tools.utils import NoDockerImageError
+ from paasta_tools.utils import PaastaColors
+ from paasta_tools.utils import PaastaNotConfiguredError
+ from paasta_tools.utils import PoolsNotConfiguredError
+ from paasta_tools.utils import SystemPaastaConfig
+ from paasta_tools.utils import validate_pool
+
+
+ DEFAULT_AWS_REGION = "us-west-2"
+ DEFAULT_SPARK_WORK_DIR = "/spark_driver"
+ DEFAULT_SPARK_DOCKER_IMAGE_PREFIX = "paasta-spark-run"
+ DEFAULT_SPARK_DOCKER_REGISTRY = "docker-dev.yelpcorp.com"
+ SENSITIVE_ENV = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"]
+ clusterman_metrics, CLUSTERMAN_YAML_FILE_PATH = get_clusterman_metrics()
+ CLUSTER_MANAGER_K8S = "kubernetes"
+ CLUSTER_MANAGER_LOCAL = "local"
+ CLUSTER_MANAGERS = {CLUSTER_MANAGER_K8S, CLUSTER_MANAGER_LOCAL}
+ DEFAULT_DOCKER_SHM_SIZE = "64m"
+ # Reference: https://spark.apache.org/docs/latest/configuration.html#application-properties
+ DEFAULT_DRIVER_CORES_BY_SPARK = 1
+ DEFAULT_DRIVER_MEMORY_BY_SPARK = "1g"
+ # Extra room for memory overhead and for any other running inside container
+ DOCKER_RESOURCE_ADJUSTMENT_FACTOR = 2
+
+ DEPRECATED_OPTS = {
+ "j": "spark.jars",
+ "jars": "spark.jars",
+ }
+
+ SPARK_COMMANDS = {"pyspark", "spark-submit"}
+
+ # config looks as follows:
+ # [default]
+ # aws_access_key_id = ...
+ # aws_secret_access_key = ...
+ SPARK_DRIVER_IAM_USER = (
+ "/nail/etc/spark_driver_k8s_role_assumer/spark_driver_k8s_role_assumer.ini"
+ )
+
+ log = logging.getLogger(__name__)
+
+
+ class DeprecatedAction(argparse.Action):
+ def __init__(self, option_strings, dest, nargs="?", **kwargs):
+ super().__init__(option_strings, dest, nargs=nargs, **kwargs)
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ print(
+ PaastaColors.red(
+ f"Use of {option_string} is deprecated. "
+ + (
+ f"Please use {DEPRECATED_OPTS.get(option_string.strip('-'), '')}=value in --spark-args."
+ if option_string.strip("-") in DEPRECATED_OPTS
+ else ""
+ )
+ )
+ )
+
+
+ def add_subparser(subparsers):
+ list_parser = subparsers.add_parser(
+ "spark-run",
+ help="Run Spark on the PaaSTA cluster",
+ description=(
+ "'paasta spark-run' launches a Spark cluster on PaaSTA. "
+ "It analyzes soa-configs and command line arguments to invoke "
+ "a 'docker run'. By default, it will pull the Spark service "
+ "image from the registry unless the --build option is used.\n\n"
+ ),
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+ # Deprecated args kept to avoid failures
+ # TODO: Remove these deprecated args later
+ list_parser.add_argument(
+ "--jars",
+ help=argparse.SUPPRESS,
+ action=DeprecatedAction,
+ )
+ list_parser.add_argument(
+ "--executor-memory",
+ help=argparse.SUPPRESS,
+ action=DeprecatedAction,
+ )
+ list_parser.add_argument(
+ "--executor-cores",
+ help=argparse.SUPPRESS,
+ action=DeprecatedAction,
+ )
+ list_parser.add_argument(
+ "--max-cores",
+ help=argparse.SUPPRESS,
+ action=DeprecatedAction,
+ )
+ list_parser.add_argument(
+ "-e",
+ "--enable-compact-bin-packing",
+ help=argparse.SUPPRESS,
+ action=DeprecatedAction,
+ )
+ list_parser.add_argument(
+ "--enable-dra",
+ help=argparse.SUPPRESS,
+ action=DeprecatedAction,
+ )
+ list_parser.add_argument(
+ "--force-use-eks",
+ help=argparse.SUPPRESS,
+ action=DeprecatedAction,
+ )
+ list_parser.add_argument(
+ "--get-eks-token-via-iam-user",
+ help="Use IAM user to get EKS token for long running spark-run jobs",
+ action="store_true",
+ )
+
+ group = list_parser.add_mutually_exclusive_group()
+ group.add_argument(
+ "-b",
+ "--build",
+ help="Build the docker image from scratch using the local Makefile's cook-image target.",
+ action="store_true",
+ default=False,
+ )
+ group.add_argument(
+ "-I",
+ "--image",
+ help="Use the provided image to start the Spark driver and executors.",
+ )
+ list_parser.add_argument(
+ "--docker-memory-limit",
+ help=(
+ "Set docker memory limit. Should be greater than driver memory. Defaults to 2x spark.driver.memory. Example: 2g, 500m, Max: 64g"
+ " Note: If memory limit provided is greater than associated with the batch instance, it will default to max memory of the box."
+ ),
+ default=None,
+ )
+ list_parser.add_argument(
+ "--docker-cpu-limit",
+ help=(
+ "Set docker cpus limit. Should be greater than driver cores. Defaults to 1x spark.driver.cores."
+ " Note: The job will fail if the limit provided is greater than number of cores present on batch box (8 for production batch boxes)."
+ ),
+ default=None,
+ )
+
+ list_parser.add_argument(
+ "--docker-shm-size",
+ help=(
+ "Set docker shared memory size limit for the driver's container. This is the same as setting docker run --shm-size and the shared"
+ " memory is mounted to /dev/shm in the container. Anything written to the shared memory mount point counts towards the docker memory"
+ " limit for the driver's container. Therefore, this should be less than --docker-memory-limit."
+ f" Defaults to {DEFAULT_DOCKER_SHM_SIZE}. Example: 8g, 256m"
+ " Note: this option is mainly useful when training TensorFlow models in the driver, with multiple GPUs using NCCL. The shared memory"
+ f" space is used to sync gradient updates between GPUs during training. The default value of {DEFAULT_DOCKER_SHM_SIZE} is typically not large enough for"
+ " this inter-gpu communication to run efficiently. We recommend a starting value of 8g to ensure that the entire set of model parameters"
+ " can fit in the shared memory. This can be less if you are training a smaller model (<1g parameters) or more if you are using a larger model (>2.5g parameters)"
+ " If you are observing low, average GPU utilization during epoch training (<65-70 percent) you can also try increasing this value; you may be"
+ " resource constrained when GPUs sync training weights between mini-batches (there are other potential bottlenecks that could cause this as well)."
+ " A tool such as nvidia-smi can be use to check GPU utilization."
+ " This option also adds the --ulimit memlock=-1 to the docker run command since this is recommended for TensorFlow applications that use NCCL."
+ " Please refer to docker run documentation for more details on --shm-size and --ulimit memlock=-1."
+ ),
+ default=None,
+ )
+ list_parser.add_argument(
+ "--force-spark-resource-configs",
+ help=(
+ "Skip the resource/instances recalculation. "
+ "This is strongly not recommended."
+ ),
+ action="store_true",
+ default=False,
+ )
+ list_parser.add_argument(
+ "--docker-registry",
+ help="Docker registry to push the Spark image built.",
+ default=None,
+ )
+
+ list_parser.add_argument(
+ "-s",
+ "--service",
+ help="The name of the service from which the Spark image is built.",
+ default=DEFAULT_SPARK_SERVICE,
+ ).completer = lazy_choices_completer(list_services)
+
+ list_parser.add_argument(
+ "-i",
+ "--instance",
+ help="Start a docker run for a particular instance of the service.",
+ default="adhoc",
+ ).completer = lazy_choices_completer(list_instances)
+
+ try:
+ system_paasta_config = load_system_paasta_config()
+ valid_clusters = system_paasta_config.get_clusters()
+ default_spark_cluster = system_paasta_config.get_spark_run_config().get(
+ "default_cluster"
+ )
+ default_spark_pool = system_paasta_config.get_spark_run_config().get(
+ "default_pool"
+ )
+ except PaastaNotConfiguredError:
+ default_spark_cluster = "pnw-devc-spark"
+ default_spark_pool = "batch"
+ valid_clusters = ["pnw-devc-spark", "pnw-prod-spark"]
+
+ list_parser.add_argument(
+ "-c",
+ "--cluster",
+ help="The name of the cluster you wish to run Spark on.",
+ choices=valid_clusters,
+ default=default_spark_cluster,
+ )
+
+ list_parser.add_argument(
+ "-p",
+ "--pool",
+ help="Name of the resource pool to run the Spark job.",
+ default=default_spark_pool,
+ )
+
+ list_parser.add_argument(
+ "-w",
+ "--work-dir",
+ default="{}:{}".format(os.getcwd(), DEFAULT_SPARK_WORK_DIR),
+ help="The read-write volume to mount in format local_abs_dir:container_abs_dir",
+ )
+
+ list_parser.add_argument(
+ "-y",
+ "--yelpsoa-config-root",
+ dest="yelpsoa_config_root",
+ help="A directory from which yelpsoa-configs should be read from.",
+ default=DEFAULT_SOA_DIR,
+ )
+
+ list_parser.add_argument(
+ "-C",
+ "--cmd",
+ help="Run the spark-shell, pyspark, spark-submit, jupyter-lab, or history-server command.",
+ )
+
+ list_parser.add_argument(
+ "--timeout-job-runtime",
+ type=str,
+ help="Timeout value which will be added before spark-submit. Job will exit if it doesn't finish in given "
+ "runtime. Recommended value: 2 * expected runtime. Example: 1h, 30m",
+ default=DEFAULT_SPARK_RUNTIME_TIMEOUT,
+ )
+
+ list_parser.add_argument(
+ "-d",
+ "--dry-run",
+ help="Shows the arguments supplied to docker as json.",
+ action="store_true",
+ default=False,
+ )
+
+ list_parser.add_argument(
+ "--spark-args",
+ help="Spark configurations documented in https://spark.apache.org/docs/latest/configuration.html, "
+ 'separated by space. For example, --spark-args "spark.executor.cores=1 spark.executor.memory=7g '
+ 'spark.executor.instances=2".',
+ )
+
+ list_parser.add_argument(
+ "--nvidia",
+ help="Use nvidia docker runtime for Spark driver process (requires GPU)",
+ action="store_true",
+ default=False,
+ )
+
+ list_parser.add_argument(
+ "--mrjob",
+ help="Pass Spark arguments to invoked command in the format expected by mrjobs",
+ action="store_true",
+ default=False,
+ )
+
+ list_parser.add_argument(
+ "--cluster-manager",
+ help="Specify which cluster manager to use. Support for certain cluster managers may be experimental",
+ dest="cluster_manager",
+ choices=CLUSTER_MANAGERS,
+ default=CLUSTER_MANAGER_K8S,
+ )
+
+ list_parser.add_argument(
+ "--tronfig",
+ help="Load the Tron config yaml. Use with --job-id.",
+ type=str,
+ default=None,
+ )
+
+ list_parser.add_argument(
+ "--job-id",
+ help="Tron job id <job_name>.<action_name> in the Tronfig to run. Use wuth --tronfig.",
+ type=str,
+ default=None,
+ )
+
+ list_parser.add_argument(
+ "--use-service-auth-token",
+ help=(
+ "Acquire service authentication token for the underlying instance,"
+ " and set it in the container environment"
+ ),
+ action="store_true",
+ dest="use_service_auth_token",
+ required=False,
+ default=False,
+ )
+
+ list_parser.add_argument(
+ "--uses-bulkdata",
+ help="Mount /nail/bulkdata in the container",
+ action="store_true",
+ default=False,
+ )
+
+ aws_group = list_parser.add_argument_group(
+ title="AWS credentials options",
+ description="If --aws-credentials-yaml is specified, it overrides all "
+ "other options. Otherwise, if -s/--service is specified, spark-run "
+ "looks for service credentials in /etc/boto_cfg/[service].yaml. If "
+ "it does not find the service credentials or no service is "
+ "specified, spark-run falls back to the boto default behavior "
+ "(checking ~/.aws/credentials, ~/.boto, etc).",
+ )
+
+ aws_group.add_argument(
+ "--aws-credentials-yaml",
+ help="Load aws keys from the provided yaml file. The yaml file must "
+ "have keys for aws_access_key_id and aws_secret_access_key.",
+ )
+
+ aws_group.add_argument(
+ "--aws-profile",
+ help="Name of the AWS profile to load credentials from. Only used when "
+ "--aws-credentials-yaml is not specified and --service is either "
+ "not specified or the service does not have credentials in "
+ "/etc/boto_cfg",
+ )
+
+ aws_group.add_argument(
+ "--aws-region",
+ help=f"Specify an aws region. If the region is not specified, we will"
+ f"default to using {DEFAULT_AWS_REGION}.",
+ default=DEFAULT_AWS_REGION,
+ )
+
+ aws_group.add_argument(
+ "--assume-aws-role",
+ help=(
+ "Takes an AWS IAM role ARN and attempts to create a session using "
+ "spark_role_assumer"
+ ),
+ )
+
+ aws_group.add_argument(
+ "--aws-role-duration",
+ help=(
+ "Duration in seconds for the role if --assume-aws-role provided. "
+ "The maximum is 43200, but by default, roles may only allow 3600."
+ ),
+ type=int,
+ default=43200,
+ )
+
+ aws_group.add_argument(
+ "--use-web-identity",
+ help=(
+ "If the current environment contains AWS_ROLE_ARN and "
+ "AWS_WEB_IDENTITY_TOKEN_FILE, creates a session to use. These "
+ "ENV vars must be present, and will be in the context of a pod-"
+ "identity enabled pod."
+ ),
+ action="store_true",
+ default=False,
+ )
+
+ aws_group.add_argument(
+ "--force-pod-identity",
+ help=(
+ "Normally the spark executor will use the pod identity defined "
+ "for the relevant instance in yelpsoa-configs. If the instance "
+ "isn't setup there yet, you can override the IAM role arn here."
+ " However, it must already be set for a different instance of "
+ "the service. Must be used with --executor-pod-identity."
+ ),
+ default=None,
+ )
+
+ aws_group.add_argument(
+ "--executor-pod-identity",
+ help=(
+ "Launch the executor pod with pod-identity derived from "
+ "the iam_role settings attached to the instance settings in "
+ "SOA configs. See also --force-pod-identity."
+ ),
+ action="store_true",
+ default=False,
+ )
+
+ jupyter_group = list_parser.add_argument_group(
+ title="Jupyter kernel culling options",
+ description="Idle kernels will be culled by default. Idle "
+ "kernels with connections can be overridden not to be culled.",
+ )
+
+ jupyter_group.add_argument(
+ "--cull-idle-timeout",
+ type=int,
+ default=7200,
+ help="Timeout (in seconds) after which a kernel is considered idle and "
+ "ready to be culled.",
+ )
+
+ jupyter_group.add_argument(
+ "--not-cull-connected",
+ action="store_true",
+ default=False,
+ help="By default, connected idle kernels are culled after timeout. "
+ "They can be skipped if not-cull-connected is specified.",
+ )
+
+ list_parser.set_defaults(command=paasta_spark_run)
+
+
+ def sanitize_container_name(container_name):
+ # container_name only allows [a-zA-Z0-9][a-zA-Z0-9_.-]
+ return re.sub("[^a-zA-Z0-9_.-]", "_", re.sub("^[^a-zA-Z0-9]+", "", container_name))
+
+
+ def get_docker_run_cmd(
+ container_name,
+ volumes,
+ env,
+ docker_img,
+ docker_cmd,
+ nvidia,
+ docker_memory_limit,
+ docker_shm_size,
+ docker_cpu_limit,
+ ):
+ print(
+ f"Setting docker memory, shared memory, and cpu limits as {docker_memory_limit}, {docker_shm_size}, and {docker_cpu_limit} core(s) respectively."
+ )
+ cmd = ["paasta_docker_wrapper", "run"]
+ cmd.append(f"--memory={docker_memory_limit}")
+ if docker_shm_size is not None:
+ cmd.append(f"--shm-size={docker_shm_size}")
+ cmd.append("--ulimit")
+ cmd.append("memlock=-1")
+ cmd.append(f"--cpus={docker_cpu_limit}")
+ cmd.append("--rm")
+ cmd.append("--net=host")
+
+ non_interactive_cmd = ["spark-submit", "history-server"]
+ if not any(c in docker_cmd for c in non_interactive_cmd):
+ cmd.append("--interactive=true")
+ if sys.stdout.isatty():
+ cmd.append("--tty=true")
+
+ container_user = (
+ # root inside container == current user outside
+ (0, 0)
+ if is_using_unprivileged_containers()
+ else (os.geteuid(), os.getegid())
+ )
+ cmd.append("--user=%d:%d" % container_user)
+ cmd.append("--name=%s" % sanitize_container_name(container_name))
+ for k, v in env.items():
+ cmd.append("--env")
+ if k in SENSITIVE_ENV:
+ cmd.append(k)
+ else:
+ cmd.append(f"{k}={v}")
+ if is_using_unprivileged_containers():
+ cmd.append("--env")
+ cmd.append(f"HOME=/nail/home/{get_username()}")
+ if nvidia:
+ cmd.append("--env")
+ cmd.append("NVIDIA_VISIBLE_DEVICES=all")
+ cmd.append("--runtime=nvidia")
+ for volume in volumes:
+ cmd.append("--volume=%s" % volume)
+ cmd.append("%s" % docker_img)
+ cmd.extend(("sh", "-c", docker_cmd))
+
+ return cmd
+
+
+ def get_docker_image(
+ args: argparse.Namespace, instance_config: InstanceConfig
+ ) -> Optional[str]:
+ """
+ Since the Docker image digest used to launch the Spark cluster is obtained by inspecting local
+ Docker images, we need to ensure that the Docker image exists locally or is pulled in all scenarios.
+ """
+ # docker image is built locally then pushed
+ if args.build:
+ return build_and_push_docker_image(args)
+
+ docker_url = ""
+ if args.image:
+ docker_url = args.image
+ else:
+ try:
+ docker_url = instance_config.get_docker_url()
+ except NoDockerImageError:
+ print(
+ PaastaColors.red(
+ "Error: No sha has been marked for deployment for the %s deploy group.\n"
+ "Please ensure this service has either run through a jenkins pipeline "
+ "or paasta mark-for-deployment has been run for %s\n"
+ % (instance_config.get_deploy_group(), args.service)
+ ),
+ sep="",
+ file=sys.stderr,
+ )
+ return None
+
+ print(
+ "Please wait while the image (%s) is pulled (times out after 5m)..."
+ % docker_url,
+ file=sys.stderr,
+ )
+ # Need sudo for credentials when pulling images from paasta docker registry (docker-paasta.yelpcorp.com)
+ # However, in CI env, we can't connect to docker via root and we can pull with user `jenkins`
+ is_ci_env = "CI" in os.environ
+ cmd_prefix = "" if is_ci_env else "sudo -H "
+ retcode, _ = _run(f"{cmd_prefix}docker pull {docker_url}", stream=True, timeout=300)
+ if retcode != 0:
+ print(
+ "\nPull failed. Are you authorized to run docker commands?",
+ file=sys.stderr,
+ )
+ return None
+ return docker_url
+
+
+ def get_smart_paasta_instance_name(args):
+ if os.environ.get("TRON_JOB_NAMESPACE"):
+ tron_job = os.environ.get("TRON_JOB_NAME")
+ tron_action = os.environ.get("TRON_ACTION")
+ return f"{tron_job}.{tron_action}"
+ else:
+ how_submitted = None
+ if args.mrjob:
+ how_submitted = "mrjob"
+ else:
+ for spark_cmd in SPARK_COMMANDS:
+ if spark_cmd in args.cmd:
+ how_submitted = spark_cmd
+ break
+ how_submitted = how_submitted or "other"
+ return f"{args.instance}_{get_username()}_{how_submitted}"
+
+
+ def get_spark_env(
+ args: argparse.Namespace,
+ spark_conf_str: str,
+ aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
+ ui_port: str,
+ system_paasta_config: SystemPaastaConfig,
+ ) -> Dict[str, str]:
+ """Create the env config dict to configure on the docker container"""
+
+ spark_env = {}
+ access_key, secret_key, session_token = aws_creds
+ if access_key:
+ spark_env["AWS_ACCESS_KEY_ID"] = access_key
+ spark_env["AWS_SECRET_ACCESS_KEY"] = secret_key
+ if session_token is not None:
+ spark_env["AWS_SESSION_TOKEN"] = session_token
+
+ spark_env["AWS_DEFAULT_REGION"] = args.aws_region
+ spark_env["PAASTA_LAUNCHED_BY"] = get_possible_launched_by_user_variable_from_env()
+ spark_env["PAASTA_INSTANCE_TYPE"] = "spark"
+
+ # Run spark (and mesos framework) as root.
+ spark_env["SPARK_USER"] = "root"
+ spark_env["SPARK_OPTS"] = spark_conf_str
+
+ # Default configs to start the jupyter notebook server
+ if args.cmd == "jupyter-lab":
+ spark_env["JUPYTER_RUNTIME_DIR"] = "/source/.jupyter"
+ spark_env["JUPYTER_DATA_DIR"] = "/source/.jupyter"
+ spark_env["JUPYTER_CONFIG_DIR"] = "/source/.jupyter"
+ elif args.cmd == "history-server":
+ dirs = args.work_dir.split(":")
+ spark_env["SPARK_LOG_DIR"] = dirs[1]
+ if not args.spark_args or not args.spark_args.startswith(
+ "spark.history.fs.logDirectory"
+ ):
+ print(
+ "history-server requires spark.history.fs.logDirectory in spark-args",
+ file=sys.stderr,
+ )
+ sys.exit(1)
+ spark_env["SPARK_HISTORY_OPTS"] = (
+ f"-D{args.spark_args} " f"-Dspark.history.ui.port={ui_port}"
+ )
+ spark_env["SPARK_DAEMON_CLASSPATH"] = "/opt/spark/extra_jars/*"
+ spark_env["SPARK_NO_DAEMONIZE"] = "true"
+
+ if args.get_eks_token_via_iam_user:
+ with open(SPARK_DRIVER_IAM_USER) as f:
+ config = ConfigParser()
+ config.read_file(f)
+
+ # these env variables are consumed by a script specified in the spark kubeconfig - and which will result in a tightly-scoped IAM identity being used for EKS cluster access
+ spark_env["GET_EKS_TOKEN_AWS_ACCESS_KEY_ID"] = config["default"][
+ "aws_access_key_id"
+ ]
+ spark_env["GET_EKS_TOKEN_AWS_SECRET_ACCESS_KEY"] = config["default"][
+ "aws_secret_access_key"
+ ]
+
+ spark_env["KUBECONFIG"] = system_paasta_config.get_spark_iam_user_kubeconfig()
+ else:
+ spark_env["KUBECONFIG"] = system_paasta_config.get_spark_kubeconfig()
+
+ return spark_env
+
+
+ def get_all_iam_roles_for_service(
+ service: str,
+ cluster: str,
+ ) -> Set[str]:
+ tron_instance_configs = load_tron_instance_configs(service, cluster)
+ roles = set()
+ for action in tron_instance_configs:
+ role = action.get_iam_role()
+ if role:
+ roles.add(role)
+ return roles
+
+
+ def _parse_user_spark_args(
+ spark_args: str,
+ ) -> Dict[str, str]:
+
+ user_spark_opts = {}
+ if spark_args:
+ for spark_arg in spark_args.split():
+ fields = spark_arg.split("=", 1)
+ if len(fields) != 2:
+ print(
+ PaastaColors.red(
+ "Spark option %s is not in format option=value." % spark_arg
+ ),
+ file=sys.stderr,
+ )
+ sys.exit(1)
+ user_spark_opts[fields[0]] = fields[1]
+
+ return user_spark_opts
+
+
+ def run_docker_container(
+ container_name,
+ volumes,
+ environment,
+ docker_img,
+ docker_cmd,
+ dry_run,
+ nvidia,
+ docker_memory_limit,
+ docker_shm_size,
+ docker_cpu_limit,
+ ) -> int:
+
+ docker_run_args = dict(
+ container_name=container_name,
+ volumes=volumes,
+ env=environment,
+ docker_img=docker_img,
+ docker_cmd=docker_cmd,
+ nvidia=nvidia,
+ docker_memory_limit=docker_memory_limit,
+ docker_shm_size=docker_shm_size,
+ docker_cpu_limit=docker_cpu_limit,
+ )
+ docker_run_cmd = get_docker_run_cmd(**docker_run_args)
+ if dry_run:
+ print(json.dumps(docker_run_cmd))
+ return 0
+
+ merged_env = {**os.environ, **environment}
+ os.execlpe("paasta_docker_wrapper", *docker_run_cmd, merged_env)
+ return 0
+
+
+ def get_spark_app_name(original_docker_cmd: Union[Any, str, List[str]]) -> str:
+ """Use submitted batch name as default spark_run job name"""
+ docker_cmds = (
+ shlex.split(original_docker_cmd)
+ if isinstance(original_docker_cmd, str)
+ else original_docker_cmd
+ )
+ spark_app_name = None
+ after_spark_submit = False
+ for arg in docker_cmds:
+ if arg == "spark-submit":
+ after_spark_submit = True
+ elif after_spark_submit and arg.endswith(".py"):
+ batch_name = arg.split("/")[-1].replace(".py", "")
+ spark_app_name = "paasta_" + batch_name
+ break
+ elif arg == "jupyter-lab":
+ spark_app_name = "paasta_jupyter"
+ break
+
+ if spark_app_name is None:
+ spark_app_name = "paasta_spark_run"
+
+ spark_app_name += f"_{get_username()}"
+
+ return spark_app_name
+
+
+ def _calculate_docker_memory_limit(
+ spark_conf: Mapping[str, str], memory_limit: Optional[str]
+ ) -> str:
+ """In Order of preference:
+ 1. Argument: --docker-memory-limit
+ 2. --spark-args or spark-submit: spark.driver.memory
+ 3. Default
+ """
+ if memory_limit:
+ return memory_limit
+
+ try:
+ docker_memory_limit_str = spark_conf.get(
+ "spark.driver.memory", DEFAULT_DRIVER_MEMORY_BY_SPARK
+ )
+ adjustment_factor = DOCKER_RESOURCE_ADJUSTMENT_FACTOR
+ match = re.match(r"([0-9]+)([a-z]*)", docker_memory_limit_str)
+ memory_val = int(match[1]) * adjustment_factor
+ memory_unit = match[2]
+ docker_memory_limit = f"{memory_val}{memory_unit}"
+ except Exception as e:
+ # For any reason it fails, continue with default value
+ print(
+ f"ERROR: Failed to parse docker memory limit. Error: {e}. Example values: 1g, 200m."
+ )
+ raise
+
+ return docker_memory_limit
+
+
+ def _calculate_docker_shared_memory_size(shm_size: Optional[str]) -> str:
+ """In Order of preference:
+ 1. Argument: --docker-shm-size
+ 3. Default
+ """
+ if shm_size:
+ return shm_size
+
+ return DEFAULT_DOCKER_SHM_SIZE
+
+
+ def _calculate_docker_cpu_limit(
+ spark_conf: Mapping[str, str], cpu_limit: Optional[str]
+ ) -> str:
+ """In Order of preference:
+ 1. Argument: --docker-cpu-limit
+ 2. --spark-args or spark-submit: spark.driver.cores
+ 3. Default
+ """
+ return (
+ cpu_limit
+ if cpu_limit
+ else spark_conf.get("spark.driver.cores", str(DEFAULT_DRIVER_CORES_BY_SPARK))
+ )
+
+
+ def configure_and_run_docker_container(
+ args: argparse.Namespace,
+ docker_img: str,
+ instance_config: InstanceConfig,
+ system_paasta_config: SystemPaastaConfig,
+ spark_conf: Dict[str, str],
+ aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
+ cluster_manager: str,
+ pod_template_path: str,
+ extra_driver_envs: Dict[str, str] = dict(),
+ ) -> int:
+ docker_memory_limit = _calculate_docker_memory_limit(
+ spark_conf, args.docker_memory_limit
+ )
+ docker_shm_size = _calculate_docker_shared_memory_size(args.docker_shm_size)
+ docker_cpu_limit = _calculate_docker_cpu_limit(
+ spark_conf,
+ args.docker_cpu_limit,
+ )
+
+ if cluster_manager in {CLUSTER_MANAGER_K8S, CLUSTER_MANAGER_LOCAL}:
+ # service_configuration_lib puts volumes into the k8s
+ # configs for local mode
+ volumes = get_volumes_from_spark_k8s_configs(spark_conf)
+ else:
+ raise UnsupportedClusterManagerException(cluster_manager)
+
+ volumes.append("%s:rw" % args.work_dir)
+ volumes.append("/nail/home:/nail/home:rw")
+
+ if pod_template_path:
+ volumes.append(f"{pod_template_path}:{pod_template_path}:rw")
+
+ # NOTE: we mount a directory here since the kubeconfig we're transitioning to requires a helper script that will co-exist in the same directory
+ kubeconfig_dir = os.path.dirname(system_paasta_config.get_spark_kubeconfig())
+ volumes.append(f"{kubeconfig_dir}:{kubeconfig_dir}:ro")
+
+ environment = instance_config.get_env_dictionary() # type: ignore
+ spark_conf_str = create_spark_config_str(spark_conf, is_mrjob=args.mrjob)
+ environment.update(
+ get_spark_env(
+ args=args,
+ spark_conf_str=spark_conf_str,
+ aws_creds=aws_creds,
+ ui_port=spark_conf["spark.ui.port"],
+ system_paasta_config=system_paasta_config,
+ )
+ ) # type:ignore
+ environment.update(extra_driver_envs)
+
887
+ if args.use_service_auth_token:
888
+ environment["YELP_SVC_AUTHZ_TOKEN"] = get_service_auth_token()
889
+
890
+ webui_url = get_webui_url(spark_conf["spark.ui.port"])
891
+ webui_url_msg = PaastaColors.green(f"\nSpark monitoring URL: ") + f"{webui_url}\n"
892
+
893
+ docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
894
+ if "history-server" in docker_cmd:
895
+ print(PaastaColors.green(f"\nSpark history server URL: ") + f"{webui_url}\n")
896
+ elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
897
+ grafana_url = get_grafana_url(spark_conf)
898
+ dashboard_url_msg = (
899
+ PaastaColors.green(f"\nGrafana dashboard: ") + f"{grafana_url}\n"
900
+ )
901
+ print(webui_url_msg)
902
+ print(dashboard_url_msg)
903
+ log.info(webui_url_msg)
904
+ log.info(dashboard_url_msg)
905
+ spark_conf_builder = spark_config.SparkConfBuilder()
906
+ history_server_url = spark_conf_builder.get_history_url(spark_conf)
907
+ if history_server_url:
908
+ history_server_url_msg = (
909
+ f"\nAfter the job is finished, you can find the spark UI from {history_server_url}\n"
910
+ "Check y/spark-recent-history for faster access to prod logs\n"
911
+ )
912
+ print(history_server_url_msg)
913
+ log.info(history_server_url_msg)
914
+ print(f"Selected cluster manager: {cluster_manager}\n")
915
+
916
+ if clusterman_metrics and _should_get_resource_requirements(docker_cmd, args.mrjob):
917
+ resources = get_resources_requested(spark_conf)
918
+ hourly_cost = get_spark_hourly_cost(
919
+ clusterman_metrics,
920
+ resources,
921
+ spark_conf["spark.executorEnv.PAASTA_CLUSTER"],
922
+ args.pool,
923
+ )
924
+ message = (
925
+ f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)"
926
+ f" is estimated to cost ${hourly_cost} per hour"
927
+ )
928
+ if clusterman_metrics.util.costs.should_warn(hourly_cost):
929
+ print(PaastaColors.red(f"WARNING: {message}"))
930
+ else:
931
+ print(message)
932
+
933
+ return run_docker_container(
934
+ container_name=spark_conf["spark.app.name"],
935
+ volumes=volumes,
936
+ environment=environment,
937
+ docker_img=docker_img,
938
+ docker_cmd=docker_cmd,
939
+ dry_run=args.dry_run,
940
+ nvidia=args.nvidia,
941
+ docker_memory_limit=docker_memory_limit,
942
+ docker_shm_size=docker_shm_size,
943
+ docker_cpu_limit=docker_cpu_limit,
944
+ )
945
+
946
+
947
+ def _should_get_resource_requirements(docker_cmd: str, is_mrjob: bool) -> bool:
948
+ return is_mrjob or any(
949
+ c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]
950
+ )
951
+
952
+
953
+ def get_docker_cmd(
954
+ args: argparse.Namespace, instance_config: InstanceConfig, spark_conf_str: str
955
+ ) -> str:
956
+ original_docker_cmd = str(args.cmd or instance_config.get_cmd())
957
+
958
+ if args.mrjob:
959
+ return original_docker_cmd + " " + spark_conf_str
960
+ # Default cli options to start the jupyter notebook server.
961
+ elif original_docker_cmd == "jupyter-lab":
962
+ cull_opts = (
963
+ "--MappingKernelManager.cull_idle_timeout=%s " % args.cull_idle_timeout
964
+ )
965
+ if args.not_cull_connected is False:
966
+ cull_opts += "--MappingKernelManager.cull_connected=True "
967
+
968
+ return "SHELL=bash USER={} /source/virtualenv_run_jupyter/bin/jupyter-lab -y --ip={} {}".format(
969
+ get_username(), socket.getfqdn(), cull_opts
970
+ )
971
+ elif original_docker_cmd == "history-server":
972
+ return "start-history-server.sh"
973
+ # Spark options are passed as options to pyspark and spark-shell.
974
+ # For jupyter, environment variable SPARK_OPTS is set instead.
975
+ else:
976
+ return inject_spark_conf_str(original_docker_cmd, spark_conf_str)
977
+
978
+
979
+ def _get_adhoc_docker_registry(service: str, soa_dir: str = DEFAULT_SOA_DIR) -> str:
+     if service is None:
+         raise NotImplementedError('"None" is not a valid service')
+
+     service_configuration = read_service_configuration(service, soa_dir)
+     return service_configuration.get("docker_registry", DEFAULT_SPARK_DOCKER_REGISTRY)
+
+
+ def build_and_push_docker_image(args: argparse.Namespace) -> Optional[str]:
+     """
+     Build an image if the default Spark service image is not preferred.
+     The image needs to be pushed to a registry for the Spark executors
+     to pull.
+     """
+     if not makefile_responds_to("cook-image"):
+         print(
+             "A local Makefile with a 'cook-image' target is required for --build",
+             file=sys.stderr,
+         )
+         return None
+
+     default_tag = "{}-{}".format(DEFAULT_SPARK_DOCKER_IMAGE_PREFIX, get_username())
+     docker_tag = os.environ.get("DOCKER_TAG", default_tag)
+     os.environ["DOCKER_TAG"] = docker_tag
+
+     cook_return = paasta_cook_image(
+         args=None, service=args.service, soa_dir=args.yelpsoa_config_root
+     )
+     if cook_return != 0:
+         return None
+
+     registry_uri = args.docker_registry or _get_adhoc_docker_registry(
+         service=args.service,
+         soa_dir=args.yelpsoa_config_root,
+     )
+
+     docker_url = f"{registry_uri}/{docker_tag}"
+     command = f"docker tag {docker_tag} {docker_url}"
+     print(PaastaColors.grey(command))
+     retcode, _ = _run(command, stream=True)
+     if retcode != 0:
+         return None
+
+     if registry_uri != DEFAULT_SPARK_DOCKER_REGISTRY:
+         command = "sudo -H docker push %s" % docker_url
+     else:
+         command = "docker push %s" % docker_url
+
+     print(PaastaColors.grey(command))
+     retcode, output = _run(command, stream=False)
+     if retcode != 0:
+         return None
+
+     # With unprivileged docker, the digest on the remote registry may not match the digest
+     # in the local environment. Because of this, we have to parse the digest message from the
+     # server response and use it downstream when launching spark executors.
+
+     # Output from `docker push` with unprivileged docker looks like
+     # Using default tag: latest
+     # The push refers to repository [docker-dev.yelpcorp.com/paasta-spark-run-dpopes:latest]
+     # latest: digest: sha256:0a43aa65174a400bd280d48d460b73eb49b0ded4072c9e173f919543bf693557
+
+     # With privileged docker, the last line has an extra "size: 123"
+     # latest: digest: sha256:0a43aa65174a400bd280d48d460b73eb49b0ded4072c9e173f919543bf693557 size: 52
+
+     digest_line = output.split("\n")[-1]
+     digest_match = re.match(r"[^:]*: [^:]*: (?P<digest>[^\s]*)", digest_line)
+     if not digest_match:
+         raise ValueError(f"Could not determine digest from output: {output}")
+     digest = digest_match.group("digest")
+
+     image_url = f"{docker_url}@{digest}"
+
+     # If the local digest doesn't match the remote digest AND the registry is
+     # non-default (which requires authentication, and consequently sudo),
+     # downstream `docker run` commands will fail trying to authenticate.
+     # To work around this, we can proactively `sudo docker pull` here so that
+     # the image exists locally and can be `docker run` without sudo
+     if registry_uri != DEFAULT_SPARK_DOCKER_REGISTRY:
+         command = f"sudo -H docker pull {image_url}"
+         print(PaastaColors.grey(command))
+         retcode, output = _run(command, stream=False)
+         if retcode != 0:
+             raise NoDockerImageError(f"Could not pull {image_url}: {output}")
+
+     return image_url
+
+
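A small self-contained check of the digest-parsing regex used above, run against the two sample `docker push` output shapes quoted in the comments (the sample digest is copied from those comments):

import re

DIGEST_RE = re.compile(r"[^:]*: [^:]*: (?P<digest>[^\s]*)")

samples = [
    # unprivileged docker: no trailing "size:" field
    "latest: digest: sha256:0a43aa65174a400bd280d48d460b73eb49b0ded4072c9e173f919543bf693557",
    # privileged docker: the trailing "size: 52" is excluded by the non-whitespace group
    "latest: digest: sha256:0a43aa65174a400bd280d48d460b73eb49b0ded4072c9e173f919543bf693557 size: 52",
]

for line in samples:
    match = DIGEST_RE.match(line)
    assert match is not None
    print(match.group("digest"))  # sha256:0a43aa6...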
+ def validate_work_dir(s):
+     dirs = s.split(":")
+     if len(dirs) != 2:
+         print(
+             "work-dir %s is not in format local_abs_dir:container_abs_dir" % s,
+             file=sys.stderr,
+         )
+         sys.exit(1)
+
+     for d in dirs:
+         if not os.path.isabs(d):
+             print("%s is not an absolute path" % d, file=sys.stderr)
+             sys.exit(1)
+
+
+ def parse_tronfig(tronfig_path: str, job_id: str) -> Optional[Dict[str, Any]]:
+     splitted = job_id.split(".")
+     if len(splitted) != 2:
+         return None
+     job_name, action_name = splitted
+
+     file_content = read_yaml_file(tronfig_path)
+     jobs = filter_templates_from_config(file_content)
+     if job_name not in jobs or action_name not in jobs[job_name].get("actions", {}):
+         return None
+     return jobs[job_name]["actions"][action_name]
+
+
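To illustrate the lookup parse_tronfig performs, here is a sketch over an already-loaded config dict; the real function reads the YAML file and filters templates first, and the job, action, and field names below are hypothetical:

from typing import Any, Dict, Optional

# Hypothetical, already-parsed Tronfig content (after template filtering).
jobs: Dict[str, Any] = {
    "nightly_batch": {
        "actions": {
            "train_model": {
                "executor": "spark",
                "pool": "batch",
                "command": "spark-submit job.py",
            },
        },
    },
}

def lookup_action(jobs: Dict[str, Any], job_id: str) -> Optional[Dict[str, Any]]:
    # job_id must look like "<job>.<action>", e.g. "nightly_batch.train_model".
    parts = job_id.split(".")
    if len(parts) != 2:
        return None
    job_name, action_name = parts
    if job_name not in jobs or action_name not in jobs[job_name].get("actions", {}):
        return None
    return jobs[job_name]["actions"][action_name]

print(lookup_action(jobs, "nightly_batch.train_model"))  # the action dict
print(lookup_action(jobs, "nightly_batch.missing"))      # None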
+ def update_args_from_tronfig(args: argparse.Namespace) -> Optional[Dict[str, str]]:
+     """
+     Load and check the following config fields from the provided Tronfig.
+       - executor
+       - pool
+       - iam_role
+       - iam_role_provider
+       - force_spark_resource_configs
+       - max_runtime
+       - command
+       - env
+       - spark_args
+
+     Returns: the environment variables dictionary, or None on failure.
+     """
+     action_dict = parse_tronfig(args.tronfig, args.job_id)
+     if action_dict is None:
+         print(
+             PaastaColors.red(f"Unable to get configs from job-id: {args.job_id}"),
+             file=sys.stderr,
+         )
+         return None
+
+     # executor must be 'spark'
+     if action_dict.get("executor", "") != "spark":
+         print(
+             PaastaColors.red("Invalid Tronfig: executor should be 'spark'"),
+             file=sys.stderr,
+         )
+         return None
+
+     # iam_role / aws_profile
+     if (
+         "iam_role" in action_dict
+         and action_dict.get("iam_role_provider", "aws") != "aws"
+     ):
+         print(
+             PaastaColors.red("Invalid Tronfig: iam_role_provider should be 'aws'"),
+             file=sys.stderr,
+         )
+         return None
+
+     # Other args: map Tronfig YAML fields to spark-run CLI args
+     fields_to_args = {
+         "pool": "pool",
+         "iam_role": "assume_aws_role",
+         "force_spark_resource_configs": "force_spark_resource_configs",
+         "max_runtime": "timeout_job_runtime",
+         "command": "cmd",
+         "spark_args": "spark_args",
+     }
+     for field_name, arg_name in fields_to_args.items():
+         if field_name in action_dict:
+             value = action_dict[field_name]
+
+             # Convert spark_args values from dict to a string "k1=v1 k2=v2"
+             if field_name == "spark_args":
+                 value = " ".join([f"{k}={v}" for k, v in dict(value).items()])
+
+             # Beautify for printing
+             arg_name_str = (f"--{arg_name.replace('_', '-')}").ljust(31, " ")
+
+             # Only load iam_role value if --aws-profile is not set
+             if field_name == "iam_role" and args.aws_profile is not None:
+                 print(
+                     PaastaColors.yellow(
+                         f"Ignoring Tronfig: `{field_name} : {value}`, since `--aws-profile` is provided. "
+                         f"We are giving higher priority to `--aws-profile` in case of paasta spark-run adhoc runs."
+                     ),
+                 )
+                 continue
+
+             if hasattr(args, arg_name):
+                 print(
+                     PaastaColors.yellow(
+                         f"Overwriting args with Tronfig: {arg_name_str} => {field_name} : {value}"
+                     ),
+                 )
+                 setattr(args, arg_name, value)
+
+     # env (currently paasta spark-run does not support Spark driver secrets environment variables)
+     return action_dict.get("env", dict())
+
+
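The spark_args conversion above turns the Tronfig mapping into the space-separated k=v string that --spark-args expects; a minimal sketch with hypothetical keys and values:

# Hypothetical Tronfig spark_args mapping.
spark_args = {
    "spark.executor.memory": "4g",
    "spark.executor.cores": 2,
    "spark.sql.shuffle.partitions": 200,
}

# Same transformation as in update_args_from_tronfig.
value = " ".join(f"{k}={v}" for k, v in dict(spark_args).items())
print(value)
# spark.executor.memory=4g spark.executor.cores=2 spark.sql.shuffle.partitions=200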
+ def paasta_spark_run(args: argparse.Namespace) -> int:
+     if args.get_eks_token_via_iam_user and os.getuid() != 0:
+         print("Re-executing paasta spark-run with sudo...", file=sys.stderr)
+         # argv[0] is treated as command name, so prepending "sudo"
+         os.execvp("sudo", ["sudo"] + sys.argv)
+         return  # will not reach unless above function is mocked
+
+     driver_envs_from_tronfig: Dict[str, str] = dict()
+     if args.tronfig is not None:
+         if args.job_id is None:
+             print(
+                 PaastaColors.red("Missing --job-id when --tronfig is provided"),
+                 file=sys.stderr,
+             )
+             return 1
+         driver_envs_from_tronfig = update_args_from_tronfig(args)
+         if driver_envs_from_tronfig is None:
+             return 1
+
+     # argparse does not work as expected with both default and
+     # type=validate_work_dir.
+     validate_work_dir(args.work_dir)
+
+     try:
+         system_paasta_config = load_system_paasta_config()
+     except PaastaNotConfiguredError:
+         print(
+             PaastaColors.yellow(
+                 "Warning: Couldn't load config files from '/etc/paasta'. This indicates "
+                 "PaaSTA is not configured locally on this host, and spark-run may not behave "
+                 "the same way it would behave on a server configured for PaaSTA."
+             ),
+             sep="\n",
+         )
+         system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")
+
+     if args.cmd == "jupyter-lab" and not args.build and not args.image:
+         print(
+             PaastaColors.red(
+                 "The jupyter-lab command requires a prebuilt image with -I or --image."
+             ),
+             file=sys.stderr,
+         )
+         return 1
+
+     # validate pool
+     try:
+         if not validate_pool(args.cluster, args.pool, system_paasta_config):
+             print(
+                 PaastaColors.red(
+                     f"Invalid --pool value. List of valid pools for cluster `{args.cluster}`: "
+                     f"{system_paasta_config.get_pools_for_cluster(args.cluster)}"
+                 ),
+                 file=sys.stderr,
+             )
+             return 1
+     except PoolsNotConfiguredError:
+         log.warning(
+             PaastaColors.yellow(
+                 f"Could not fetch allowed_pools for `{args.cluster}`. Skipping pool validation.\n"
+             )
+         )
+
+     # annoyingly, there are two layers of aliases: one for the soaconfigs to read from
+     # (that's this alias lookup) - and then another layer later when figuring out what
+     # k8s server url to use ;_;
+     cluster = system_paasta_config.get_cluster_aliases().get(args.cluster, args.cluster)
+     # Use the default spark:client instance configs if not provided
+     try:
+         instance_config = get_instance_config(
+             service=args.service,
+             instance=args.instance,
+             cluster=cluster,
+             load_deployments=args.build is False and args.image is None,
+             soa_dir=args.yelpsoa_config_root,
+         )
+         # If the spark job has uses_bulkdata set, propagate it to the instance_config.
+         # If not, then whatever the instance_config has will be used.
+         if args.uses_bulkdata:
+             instance_config.config_dict["uses_bulkdata"] = args.uses_bulkdata
+     except NoConfigurationForServiceError as e:
+         print(str(e), file=sys.stderr)
+         return 1
+     except NoDeploymentsAvailable:
+         print(
+             PaastaColors.red(
+                 "Error: No deployments.json found in %(soa_dir)s/%(service)s.\n"
+                 "You can generate this by running:\n"
+                 "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
+                 % {"soa_dir": args.yelpsoa_config_root, "service": args.service}
+             ),
+             sep="\n",
+             file=sys.stderr,
+         )
+         return 1
+
+     if not args.cmd and not instance_config.get_cmd():
+         print(
+             "A command is required: pyspark, spark-shell, spark-submit, or jupyter",
+             file=sys.stderr,
+         )
+         return 1
+
+     service_account_name = None
+     iam_role = instance_config.get_iam_role()
+     if args.executor_pod_identity and not (iam_role or args.force_pod_identity):
+         print(
+             "--executor-pod-identity set but no iam_role settings found.",
+             file=sys.stderr,
+         )
+         return 1
+     if args.executor_pod_identity:
+         if args.force_pod_identity:
+             if args.yelpsoa_config_root != DEFAULT_SOA_DIR:
+                 print(
+                     "--force-pod-identity cannot be used with --yelpsoa-config-root",
+                     file=sys.stderr,
+                 )
+                 return 1
+
+             allowed_iam_roles = get_all_iam_roles_for_service(
+                 args.service, args.cluster
+             )
+             if args.force_pod_identity not in allowed_iam_roles:
+                 print(
+                     f"{args.force_pod_identity} is not an allowed role for this service. "
+                     f"Allowed roles are: {allowed_iam_roles}.",
+                     file=sys.stderr,
+                 )
+                 return 1
+
+             service_account_name = get_service_account_name(args.force_pod_identity)
+         else:
+             service_account_name = get_service_account_name(iam_role)
+         if (
+             not args.aws_credentials_yaml
+             and not args.aws_profile
+             and not args.assume_aws_role
+         ):
+             args.aws_credentials_yaml = (
+                 system_paasta_config.get_default_spark_iam_user()
+             )
+         log.info(f"Running executor with service account {service_account_name}")
+
+     aws_creds = get_aws_credentials(
+         service=args.service,
+         aws_credentials_yaml=args.aws_credentials_yaml,
+         profile_name=args.aws_profile,
+         assume_aws_role_arn=args.assume_aws_role,
+         session_duration=args.aws_role_duration,
+         use_web_identity=args.use_web_identity,
+     )
+
+     # If executor pods use a service account, they don't need static aws creds
+     # but the driver still does
+     if service_account_name:
+         executor_aws_creds = None
+     else:
+         executor_aws_creds = aws_creds
+
+     docker_image_digest = get_docker_image(args, instance_config)
+     if docker_image_digest is None:
+         return 1
+
+     volumes = instance_config.get_volumes(
+         system_paasta_config.get_volumes(),
+     )
+     app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())
+
+     user_spark_opts = _parse_user_spark_args(args.spark_args)
+
+     args.cmd = auto_add_timeout_for_spark_job(args.cmd, args.timeout_job_runtime)
+
+     # This is required if configs are provided as part of `spark-submit`;
+     # the other way to provide them is with --spark-args
+     sub_cmds = args.cmd.split(" ")  # spark.driver.memory=10g
+     for cmd in sub_cmds:
+         if cmd.startswith("spark.driver.memory") or cmd.startswith(
+             "spark.driver.cores"
+         ):
+             key, value = cmd.split("=")
+             user_spark_opts[key] = value
+
+     paasta_instance = get_smart_paasta_instance_name(args)
+
+     k8s_server_address = get_k8s_url_for_cluster(args.cluster)
+     paasta_cluster = system_paasta_config.get_eks_cluster_aliases().get(
+         args.cluster, args.cluster
+     )
+
+     spark_conf_builder = spark_config.SparkConfBuilder()
+     spark_conf = spark_conf_builder.get_spark_conf(
+         cluster_manager=args.cluster_manager,
+         spark_app_base_name=app_base_name,
+         docker_img=docker_image_digest,
+         user_spark_opts=user_spark_opts,
+         paasta_cluster=paasta_cluster,
+         paasta_pool=args.pool,
+         paasta_service=args.service,
+         paasta_instance=paasta_instance,
+         extra_volumes=cast(List[Mapping[str, str]], volumes),
+         aws_creds=executor_aws_creds,
+         aws_region=args.aws_region,
+         force_spark_resource_configs=args.force_spark_resource_configs,
+         use_eks=True,
+         k8s_server_address=k8s_server_address,
+         service_account_name=service_account_name,
+     )
+
+     return configure_and_run_docker_container(
+         args,
+         docker_img=docker_image_digest,
+         instance_config=instance_config,
+         system_paasta_config=system_paasta_config,
+         spark_conf=spark_conf,
+         aws_creds=aws_creds,
+         cluster_manager=args.cluster_manager,
+         pod_template_path=spark_conf.get(
+             "spark.kubernetes.executor.podTemplateFile", ""
+         ),
+         extra_driver_envs=driver_envs_from_tronfig,
+     )
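Finally, the inline-conf handling near the end of paasta_spark_run picks spark.driver.memory and spark.driver.cores settings out of the command string itself; a standalone sketch with a hypothetical command:

# Hypothetical spark-submit command carrying driver settings as k=v tokens.
cmd = "spark-submit --conf spark.driver.memory=10g --conf spark.driver.cores=4 job.py"

user_spark_opts = {}
for token in cmd.split(" "):
    # Same prefix check as in paasta_spark_run.
    if token.startswith("spark.driver.memory") or token.startswith("spark.driver.cores"):
        key, value = token.split("=")
        user_spark_opts[key] = value

print(user_spark_opts)  # {'spark.driver.memory': '10g', 'spark.driver.cores': '4'}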