opencode-skills-collection 3.1.0 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (291)
  1. package/bundled-skills/.antigravity-install-manifest.json +84 -1
  2. package/bundled-skills/2slides-ppt-generator/SKILL.md +8 -7
  3. package/bundled-skills/android-cli/SKILL.md +19 -7
  4. package/bundled-skills/android-ui-journey-testing/SKILL.md +191 -0
  5. package/bundled-skills/apple-notes-search/SKILL.md +12 -2
  6. package/bundled-skills/ask-matt/SKILL.md +92 -0
  7. package/bundled-skills/atlas-ledger/SKILL.md +8 -0
  8. package/bundled-skills/bugs-are-annoying/SKILL.md +137 -0
  9. package/bundled-skills/codebase-design/DEEPENING.md +37 -0
  10. package/bundled-skills/codebase-design/DESIGN-IT-TWICE.md +44 -0
  11. package/bundled-skills/codebase-design/SKILL.md +145 -0
  12. package/bundled-skills/codex-fable5/SKILL.md +10 -2
  13. package/bundled-skills/competitor-analysis/LICENSE.txt +21 -0
  14. package/bundled-skills/competitor-analysis/SKILL.md +434 -0
  15. package/bundled-skills/competitor-analysis/references/battle-card-subagent.md +127 -0
  16. package/bundled-skills/competitor-analysis/references/battle-card.md +91 -0
  17. package/bundled-skills/competitor-analysis/references/example-research.md +130 -0
  18. package/bundled-skills/competitor-analysis/references/report-template.html +127 -0
  19. package/bundled-skills/competitor-analysis/references/research-patterns.md +217 -0
  20. package/bundled-skills/competitor-analysis/references/workflow.md +434 -0
  21. package/bundled-skills/competitor-analysis/scripts/capture_screenshots.mjs +142 -0
  22. package/bundled-skills/competitor-analysis/scripts/compile_report.mjs +929 -0
  23. package/bundled-skills/competitor-analysis/scripts/extract_vs_names.mjs +140 -0
  24. package/bundled-skills/competitor-analysis/scripts/gate_candidates.mjs +224 -0
  25. package/bundled-skills/competitor-analysis/scripts/list_urls.mjs +90 -0
  26. package/bundled-skills/competitor-analysis/scripts/md_utils.mjs +50 -0
  27. package/bundled-skills/competitor-analysis/scripts/merge_partials.mjs +291 -0
  28. package/bundled-skills/competitor-analysis/scripts/package.json +6 -0
  29. package/bundled-skills/design-it/3d-ui/SKILL.md +259 -0
  30. package/bundled-skills/design-it/SKILL.md +170 -0
  31. package/bundled-skills/design-it/ai-native-ui/SKILL.md +295 -0
  32. package/bundled-skills/design-it/aurora-ui/SKILL.md +307 -0
  33. package/bundled-skills/design-it/bento-ui/SKILL.md +314 -0
  34. package/bundled-skills/design-it/brutalism/SKILL.md +270 -0
  35. package/bundled-skills/design-it/brutalist-typography/SKILL.md +287 -0
  36. package/bundled-skills/design-it/card-based-design/SKILL.md +262 -0
  37. package/bundled-skills/design-it/claymorphism/SKILL.md +287 -0
  38. package/bundled-skills/design-it/color-blocking/SKILL.md +278 -0
  39. package/bundled-skills/design-it/command-center-ui/SKILL.md +345 -0
  40. package/bundled-skills/design-it/cyber-y2k/SKILL.md +312 -0
  41. package/bundled-skills/design-it/cyberpunk-ui/SKILL.md +262 -0
  42. package/bundled-skills/design-it/dark-mode/SKILL.md +289 -0
  43. package/bundled-skills/design-it/dashboard-design/SKILL.md +331 -0
  44. package/bundled-skills/design-it/data-dense-design/SKILL.md +322 -0
  45. package/bundled-skills/design-it/duotone-design/SKILL.md +248 -0
  46. package/bundled-skills/design-it/editorial-design/SKILL.md +328 -0
  47. package/bundled-skills/design-it/flat-design/SKILL.md +221 -0
  48. package/bundled-skills/design-it/flat-design-2/SKILL.md +240 -0
  49. package/bundled-skills/design-it/floating-ui/SKILL.md +299 -0
  50. package/bundled-skills/design-it/frutiger-aero/SKILL.md +274 -0
  51. package/bundled-skills/design-it/glassmorphism/SKILL.md +272 -0
  52. package/bundled-skills/design-it/gradient-design/SKILL.md +309 -0
  53. package/bundled-skills/design-it/high-contrast/SKILL.md +288 -0
  54. package/bundled-skills/design-it/holographic-ui/SKILL.md +310 -0
  55. package/bundled-skills/design-it/isometric-design/SKILL.md +228 -0
  56. package/bundled-skills/design-it/layered-design/SKILL.md +247 -0
  57. package/bundled-skills/design-it/material-design/SKILL.md +275 -0
  58. package/bundled-skills/design-it/maximalism/SKILL.md +297 -0
  59. package/bundled-skills/design-it/minimalism/SKILL.md +267 -0
  60. package/bundled-skills/design-it/monochromatic-ui/SKILL.md +296 -0
  61. package/bundled-skills/design-it/neo-brutalism/SKILL.md +270 -0
  62. package/bundled-skills/design-it/neumorphism/SKILL.md +248 -0
  63. package/bundled-skills/design-it/retro-design/SKILL.md +283 -0
  64. package/bundled-skills/design-it/retro-futurism/SKILL.md +259 -0
  65. package/bundled-skills/design-it/sci-fi-interface/SKILL.md +309 -0
  66. package/bundled-skills/design-it/skeuomorphism/SKILL.md +280 -0
  67. package/bundled-skills/design-it/soft-pastel/SKILL.md +307 -0
  68. package/bundled-skills/design-it/spatial-computing-ui/SKILL.md +300 -0
  69. package/bundled-skills/design-it/spatial-design/SKILL.md +268 -0
  70. package/bundled-skills/design-it/swiss-design/SKILL.md +293 -0
  71. package/bundled-skills/design-it/synthwave/SKILL.md +257 -0
  72. package/bundled-skills/design-it/tile-design/SKILL.md +297 -0
  73. package/bundled-skills/design-it/typography-first/SKILL.md +247 -0
  74. package/bundled-skills/design-it/vaporwave/SKILL.md +331 -0
  75. package/bundled-skills/design-it/vibrant-maximalism/SKILL.md +291 -0
  76. package/bundled-skills/design-it/widget-based-design/SKILL.md +274 -0
  77. package/bundled-skills/design-it/y2k-design/SKILL.md +268 -0
  78. package/bundled-skills/diagnosing-bugs/SKILL.md +165 -0
  79. package/bundled-skills/diagnosing-bugs/scripts/hitl-loop.template.sh +41 -0
  80. package/bundled-skills/docs/contributors/skill-scoring.md +235 -0
  81. package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
  82. package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
  83. package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
  84. package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
  85. package/bundled-skills/docs/users/bundles.md +145 -1
  86. package/bundled-skills/docs/users/claude-code-skills.md +1 -1
  87. package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
  88. package/bundled-skills/docs/users/getting-started.md +1 -1
  89. package/bundled-skills/docs/users/kiro-integration.md +1 -1
  90. package/bundled-skills/docs/users/specialized-plugin-roadmap.md +11 -4
  91. package/bundled-skills/docs/users/usage.md +4 -4
  92. package/bundled-skills/docs/users/visual-guide.md +4 -4
  93. package/bundled-skills/domain-modeling/ADR-FORMAT.md +47 -0
  94. package/bundled-skills/domain-modeling/CONTEXT-FORMAT.md +60 -0
  95. package/bundled-skills/domain-modeling/SKILL.md +105 -0
  96. package/bundled-skills/dos-verify-done-claims/SKILL.md +16 -4
  97. package/bundled-skills/ecl-harness-engineer/agents/creator-config.md +1 -1
  98. package/bundled-skills/ecl-harness-engineer/references/environment-config-guide.md +2 -2
  99. package/bundled-skills/ecl-harness-engineer/references/environment-detection-guide.md +4 -4
  100. package/bundled-skills/event-staffing-ordering/SKILL.md +4 -0
  101. package/bundled-skills/grill-me/SKILL.md +36 -0
  102. package/bundled-skills/grill-with-docs/SKILL.md +36 -0
  103. package/bundled-skills/grilling/SKILL.md +39 -0
  104. package/bundled-skills/handoff/SKILL.md +45 -0
  105. package/bundled-skills/image-generator/.env.example +7 -0
  106. package/bundled-skills/image-generator/SKILL.md +509 -0
  107. package/bundled-skills/improve-codebase-architecture/HTML-REPORT.md +123 -0
  108. package/bundled-skills/improve-codebase-architecture/SKILL.md +97 -0
  109. package/bundled-skills/learn/SKILL.md +156 -0
  110. package/bundled-skills/lesson-generator/SKILL.md +90 -0
  111. package/bundled-skills/llm-council/.env.example +7 -0
  112. package/bundled-skills/llm-council/SKILL.md +602 -0
  113. package/bundled-skills/loop-library/SKILL.md +208 -0
  114. package/bundled-skills/loop-library/agents/openai.yaml +4 -0
  115. package/bundled-skills/loop-library/references/catalog.md +270 -0
  116. package/bundled-skills/lovable-cleanup/SKILL.md +9 -7
  117. package/bundled-skills/macos-screen-recorder/SKILL.md +9 -1
  118. package/bundled-skills/mailtrap-managing-contacts/SKILL.md +112 -0
  119. package/bundled-skills/mailtrap-sending-emails/SKILL.md +167 -0
  120. package/bundled-skills/mailtrap-setting-up-sending-domain/SKILL.md +77 -0
  121. package/bundled-skills/mailtrap-testing-with-sandbox/SKILL.md +110 -0
  122. package/bundled-skills/prototype/LOGIC.md +79 -0
  123. package/bundled-skills/prototype/SKILL.md +62 -0
  124. package/bundled-skills/prototype/UI.md +112 -0
  125. package/bundled-skills/screenstudio-alt/SKILL.md +9 -1
  126. package/bundled-skills/setup-matt-pocock-skills/SKILL.md +158 -0
  127. package/bundled-skills/setup-matt-pocock-skills/domain.md +51 -0
  128. package/bundled-skills/setup-matt-pocock-skills/issue-tracker-github.md +34 -0
  129. package/bundled-skills/setup-matt-pocock-skills/issue-tracker-gitlab.md +35 -0
  130. package/bundled-skills/setup-matt-pocock-skills/issue-tracker-local.md +19 -0
  131. package/bundled-skills/setup-matt-pocock-skills/triage-labels.md +15 -0
  132. package/bundled-skills/survey-generator/LICENSE +21 -0
  133. package/bundled-skills/survey-generator/SKILL.md +143 -0
  134. package/bundled-skills/survey-generator/build_artifact.py +208 -0
  135. package/bundled-skills/survey-generator/examples/agentic-engineering/research_bundle.json +1196 -0
  136. package/bundled-skills/survey-generator/examples/agentic-engineering/survey.html +706 -0
  137. package/bundled-skills/survey-generator/style_spec.json +85 -0
  138. package/bundled-skills/survey-generator/templates/research_bundle_template.json +69 -0
  139. package/bundled-skills/tdd/SKILL.md +139 -0
  140. package/bundled-skills/tdd/mocking.md +59 -0
  141. package/bundled-skills/tdd/refactoring.md +10 -0
  142. package/bundled-skills/tdd/tests.md +61 -0
  143. package/bundled-skills/teach/GLOSSARY-FORMAT.md +35 -0
  144. package/bundled-skills/teach/LEARNING-RECORD-FORMAT.md +46 -0
  145. package/bundled-skills/teach/MISSION-FORMAT.md +31 -0
  146. package/bundled-skills/teach/RESOURCES-FORMAT.md +32 -0
  147. package/bundled-skills/teach/SKILL.md +169 -0
  148. package/bundled-skills/to-issues/SKILL.md +115 -0
  149. package/bundled-skills/to-prd/SKILL.md +104 -0
  150. package/bundled-skills/tools-page-seo-optimizer/SKILL.md +616 -0
  151. package/bundled-skills/triage/AGENT-BRIEF.md +207 -0
  152. package/bundled-skills/triage/OUT-OF-SCOPE.md +105 -0
  153. package/bundled-skills/triage/SKILL.md +143 -0
  154. package/bundled-skills/vibecode-production-qa-validator/SKILL.md +371 -141
  155. package/bundled-skills/wiki-builder/SKILL.md +157 -0
  156. package/bundled-skills/wiki-builder/agents/openai.yaml +5 -0
  157. package/bundled-skills/wiki-builder/references/wiki-flavors.md +98 -0
  158. package/bundled-skills/wiki-builder/scripts/init_wiki.sh +105 -0
  159. package/bundled-skills/wiki-builder/templates/index.md +20 -0
  160. package/bundled-skills/wiki-builder/templates/maintenance-log.md +7 -0
  161. package/bundled-skills/wiki-builder/templates/prompts/compile-concept-page.md +12 -0
  162. package/bundled-skills/wiki-builder/templates/prompts/compile-index.md +11 -0
  163. package/bundled-skills/wiki-builder/templates/prompts/compile-source-page.md +12 -0
  164. package/bundled-skills/wiki-builder/templates/prompts/lint-wiki.md +10 -0
  165. package/bundled-skills/wiki-builder/templates/prompts/query-and-file.md +11 -0
  166. package/bundled-skills/wiki-builder/templates/sources.md +9 -0
  167. package/bundled-skills/wiki-builder/templates/wiki.config.md +53 -0
  168. package/bundled-skills/writing-great-skills/GLOSSARY.md +181 -0
  169. package/bundled-skills/writing-great-skills/SKILL.md +111 -0
  170. package/bundled-skills/yao-meta-skill/SKILL.md +86 -0
  171. package/bundled-skills/yao-meta-skill/agents/interface.yaml +26 -0
  172. package/bundled-skills/yao-meta-skill/manifest.json +24 -0
  173. package/bundled-skills/yao-meta-skill/references/artifact-design-doctrine.md +49 -0
  174. package/bundled-skills/yao-meta-skill/references/authoring-discipline.md +78 -0
  175. package/bundled-skills/yao-meta-skill/references/autonomous-adaptation.md +65 -0
  176. package/bundled-skills/yao-meta-skill/references/distribution-registry-method.md +60 -0
  177. package/bundled-skills/yao-meta-skill/references/eval-playbook.md +69 -0
  178. package/bundled-skills/yao-meta-skill/references/gate-selection.md +68 -0
  179. package/bundled-skills/yao-meta-skill/references/governance.md +134 -0
  180. package/bundled-skills/yao-meta-skill/references/human-review-template.md +54 -0
  181. package/bundled-skills/yao-meta-skill/references/intent-dialogue.md +138 -0
  182. package/bundled-skills/yao-meta-skill/references/iteration-philosophy.md +30 -0
  183. package/bundled-skills/yao-meta-skill/references/non-skill-decision-tree.md +39 -0
  184. package/bundled-skills/yao-meta-skill/references/operating-modes.md +107 -0
  185. package/bundled-skills/yao-meta-skill/references/output-eval-method.md +113 -0
  186. package/bundled-skills/yao-meta-skill/references/output-quality-risk.md +41 -0
  187. package/bundled-skills/yao-meta-skill/references/output-visual-quality.md +53 -0
  188. package/bundled-skills/yao-meta-skill/references/packaging-contracts.md +70 -0
  189. package/bundled-skills/yao-meta-skill/references/pattern-extraction-doctrine.md +76 -0
  190. package/bundled-skills/yao-meta-skill/references/platform-capability-matrix.md +49 -0
  191. package/bundled-skills/yao-meta-skill/references/prompt-engineering-doctrine.md +76 -0
  192. package/bundled-skills/yao-meta-skill/references/qa-ladder.md +57 -0
  193. package/bundled-skills/yao-meta-skill/references/reference-scan.md +126 -0
  194. package/bundled-skills/yao-meta-skill/references/regression-cause-taxonomy.md +80 -0
  195. package/bundled-skills/yao-meta-skill/references/resource-boundaries.md +120 -0
  196. package/bundled-skills/yao-meta-skill/references/review-studio-method.md +87 -0
  197. package/bundled-skills/yao-meta-skill/references/review-waiver-method.md +76 -0
  198. package/bundled-skills/yao-meta-skill/references/runtime-conformance-method.md +21 -0
  199. package/bundled-skills/yao-meta-skill/references/skill-archetypes.md +86 -0
  200. package/bundled-skills/yao-meta-skill/references/skill-atlas-method.md +35 -0
  201. package/bundled-skills/yao-meta-skill/references/skill-engineering-method.md +210 -0
  202. package/bundled-skills/yao-meta-skill/references/skill-ir-method.md +41 -0
  203. package/bundled-skills/yao-meta-skill/references/skillops-decision-policy.md +53 -0
  204. package/bundled-skills/yao-meta-skill/references/systems-thinking-doctrine.md +75 -0
  205. package/bundled-skills/yao-meta-skill/references/telemetry-drift-method.md +182 -0
  206. package/bundled-skills/yao-meta-skill/references/trust-security-method.md +79 -0
  207. package/bundled-skills/yao-meta-skill/references/user-memory-policy.md +35 -0
  208. package/bundled-skills/youtube-notetaker/SKILL.md +209 -0
  209. package/bundled-skills/youtube-notetaker/reference/artifact.html +269 -0
  210. package/bundled-skills/youtube-notetaker/scripts/contact_sheet.py +53 -0
  211. package/bundled-skills/youtube-notetaker/scripts/detect_slides.sh +19 -0
  212. package/bundled-skills/youtube-notetaker/scripts/download.sh +24 -0
  213. package/bundled-skills/youtube-notetaker/scripts/extract_slides.py +43 -0
  214. package/bundled-skills/youtube-notetaker/scripts/serve.py +222 -0
  215. package/bundled-skills/youtube-notetaker/scripts/setup.sh +27 -0
  216. package/bundled-skills/youtube-notetaker/scripts/verify.sh +31 -0
  217. package/bundled-skills/youtube-notetaker/scripts/vtt_to_transcript.py +59 -0
  218. package/bundled-skills/youtube-notetaker/scripts/write_library_item.py +69 -0
  219. package/package.json +1 -1
  220. package/skills_index.json +2013 -330
  221. package/bundled-skills/ai-md/SKILL.md +0 -523
  222. package/bundled-skills/atlas-contract/SKILL.md +0 -650
  223. package/bundled-skills/busybox-on-windows/SKILL.md +0 -40
  224. package/bundled-skills/monte-carlo-prevent/SKILL.md +0 -257
  225. package/bundled-skills/monte-carlo-prevent/references/TROUBLESHOOTING.md +0 -23
  226. package/bundled-skills/monte-carlo-prevent/references/parameters.md +0 -32
  227. package/bundled-skills/monte-carlo-prevent/references/workflows.md +0 -478
  228. package/bundled-skills/monte-carlo-push-ingestion/SKILL.md +0 -372
  229. package/bundled-skills/monte-carlo-push-ingestion/references/anomaly-detection.md +0 -87
  230. package/bundled-skills/monte-carlo-push-ingestion/references/custom-lineage.md +0 -203
  231. package/bundled-skills/monte-carlo-push-ingestion/references/direct-http-api.md +0 -207
  232. package/bundled-skills/monte-carlo-push-ingestion/references/prerequisites.md +0 -150
  233. package/bundled-skills/monte-carlo-push-ingestion/references/push-lineage.md +0 -160
  234. package/bundled-skills/monte-carlo-push-ingestion/references/push-metadata.md +0 -158
  235. package/bundled-skills/monte-carlo-push-ingestion/references/push-query-logs.md +0 -219
  236. package/bundled-skills/monte-carlo-push-ingestion/references/validation.md +0 -257
  237. package/bundled-skills/monte-carlo-push-ingestion/scripts/sample_verify.py +0 -357
  238. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_lineage.py +0 -70
  239. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_metadata.py +0 -65
  240. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_and_push_query_logs.py +0 -70
  241. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_lineage.py +0 -214
  242. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_metadata.py +0 -160
  243. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/collect_query_logs.py +0 -164
  244. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_lineage.py +0 -198
  245. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_metadata.py +0 -193
  246. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery/push_query_logs.py +0 -207
  247. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_metadata.py +0 -71
  248. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_and_push_query_logs.py +0 -64
  249. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_metadata.py +0 -253
  250. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/collect_query_logs.py +0 -149
  251. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_metadata.py +0 -190
  252. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/bigquery-iceberg/push_query_logs.py +0 -208
  253. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_lineage.py +0 -83
  254. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_metadata.py +0 -77
  255. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_and_push_query_logs.py +0 -83
  256. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_lineage.py +0 -240
  257. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_metadata.py +0 -212
  258. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/collect_query_logs.py +0 -204
  259. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_lineage.py +0 -192
  260. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_metadata.py +0 -178
  261. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/databricks/push_query_logs.py +0 -200
  262. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_lineage.py +0 -119
  263. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_metadata.py +0 -119
  264. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_and_push_query_logs.py +0 -117
  265. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_lineage.py +0 -265
  266. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_metadata.py +0 -313
  267. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/collect_query_logs.py +0 -284
  268. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_lineage.py +0 -309
  269. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_metadata.py +0 -245
  270. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/hive/push_query_logs.py +0 -255
  271. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_lineage.py +0 -78
  272. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_metadata.py +0 -80
  273. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_and_push_query_logs.py +0 -88
  274. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_lineage.py +0 -235
  275. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_metadata.py +0 -219
  276. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/collect_query_logs.py +0 -239
  277. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_lineage.py +0 -178
  278. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_metadata.py +0 -178
  279. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/redshift/push_query_logs.py +0 -196
  280. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_lineage.py +0 -154
  281. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_metadata.py +0 -137
  282. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_and_push_query_logs.py +0 -137
  283. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_lineage.py +0 -349
  284. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_metadata.py +0 -329
  285. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/collect_query_logs.py +0 -254
  286. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_lineage.py +0 -307
  287. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_metadata.py +0 -228
  288. package/bundled-skills/monte-carlo-push-ingestion/scripts/templates/snowflake/push_query_logs.py +0 -248
  289. package/bundled-skills/monte-carlo-push-ingestion/scripts/test_template_sdk_usage.py +0 -340
  290. package/bundled-skills/skill-optimizer/SKILL.md +0 -271
  291. package/bundled-skills/using-superpowers/SKILL.md +0 -98
@@ -1,117 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Collect Hive query logs from a local log file and push them to Monte Carlo
4
- in one step.
5
-
6
- Thin wrapper that calls ``collect()`` from ``collect_query_logs`` followed by
7
- ``push()`` from ``push_query_logs``, then writes the final manifest (with
8
- ``resource_uuid`` and ``invocation_id``) to ``--output-file``.
9
-
10
- Substitution points
11
- -------------------
12
- - MCD_INGEST_ID (env) / --key-id (CLI) : Monte Carlo ingestion key ID
13
- - MCD_INGEST_TOKEN (env) / --key-token (CLI) : Monte Carlo ingestion key token
14
- - MCD_RESOURCE_UUID (env) / --resource-uuid (CLI) : MC resource UUID (optional for query logs)
15
- - --log-file path to local HiveServer2 log (default: /tmp/root/hive.log)
16
- - --op-logs-dir optional directory of per-query <queryId>.log files
17
-
18
- Prerequisites
19
- -------------
20
- pip install pycarlo python-dateutil python-dotenv
21
-
22
- Usage
23
- -----
24
- python collect_and_push_query_logs.py \\
25
- --key-id <MCD_INGEST_ID> \\
26
- --key-token <MCD_INGEST_TOKEN> \\
27
- --resource-uuid <MCD_RESOURCE_UUID> \\
28
- --log-file /tmp/root/hive.log \\
29
- [--op-logs-dir /var/log/hive/operation_logs]
30
- """
31
-
32
- import argparse
33
- import json
34
- import os
35
-
36
- from collect_query_logs import collect
37
- from push_query_logs import DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_SECONDS, push
38
-
39
-
40
- def main() -> None:
41
- parser = argparse.ArgumentParser(
42
- description="Collect Hive query logs from a local log file and push to Monte Carlo",
43
- )
44
- # Collect args
45
- parser.add_argument(
46
- "--log-file",
47
- default="/tmp/root/hive.log",
48
- help="Path to local HiveServer2 log file (default: /tmp/root/hive.log)", # ← SUBSTITUTE: your log path
49
- )
50
- parser.add_argument(
51
- "--op-logs-dir",
52
- default=None,
53
- help=(
54
- "Directory containing per-query Hive operation logs (<queryId>.log). "
55
- "When provided, returned_rows is populated from SelectOperator RECORDS_OUT counts."
56
- ),
57
- # ← SUBSTITUTE: e.g. /var/log/hive/operation_logs or wherever Hive writes op logs
58
- )
59
- # Push / MC args
60
- parser.add_argument(
61
- "--key-id",
62
- default=os.environ.get("MCD_INGEST_ID"),
63
- help="Monte Carlo ingestion key ID (env: MCD_INGEST_ID)",
64
- )
65
- parser.add_argument(
66
- "--key-token",
67
- default=os.environ.get("MCD_INGEST_TOKEN"),
68
- help="Monte Carlo ingestion key token (env: MCD_INGEST_TOKEN)",
69
- )
70
- parser.add_argument(
71
- "--resource-uuid",
72
- default=os.environ.get("MCD_RESOURCE_UUID"),
73
- help="Monte Carlo resource UUID (optional for query logs) (env: MCD_RESOURCE_UUID)",
74
- )
75
- parser.add_argument(
76
- "--output-file",
77
- default="query_logs_output.json",
78
- help="Path to write the output manifest (default: query_logs_output.json)",
79
- )
80
- parser.add_argument(
81
- "--batch-size",
82
- type=int,
83
- default=DEFAULT_BATCH_SIZE,
84
- metavar="N",
85
- help=f"Max events per POST (default: {DEFAULT_BATCH_SIZE})",
86
- )
87
- parser.add_argument(
88
- "--timeout",
89
- type=int,
90
- default=DEFAULT_TIMEOUT_SECONDS,
91
- metavar="SEC",
92
- help=f"HTTP timeout per request in seconds (default: {DEFAULT_TIMEOUT_SECONDS})",
93
- )
94
- args = parser.parse_args()
95
-
96
- if not args.key_id or not args.key_token:
97
- parser.error("--key-id and --key-token are required (or set MCD_INGEST_ID / MCD_INGEST_TOKEN)")
98
-
99
- manifest = collect(log_file=args.log_file, op_logs_dir=args.op_logs_dir)
100
-
101
- push(
102
- manifest=manifest,
103
- key_id=args.key_id,
104
- key_token=args.key_token,
105
- resource_uuid=args.resource_uuid,
106
- batch_size=args.batch_size,
107
- timeout_seconds=args.timeout,
108
- )
109
-
110
- with open(args.output_file, "w") as fh:
111
- json.dump(manifest, fh, indent=2)
112
- print(f"Query log manifest written to {args.output_file}")
113
- print("Done.")
114
-
115
-
116
- if __name__ == "__main__":
117
- main()
@@ -1,265 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Extract table and column lineage from a local HiveServer2 log file — collection only.
4
-
5
- Reads a plain-text Hive log file (not compressed), extracts SQL query blocks
6
- from "Executing command" / "Starting command" entries, detects CTAS and
7
- INSERT INTO ... SELECT patterns to build lineage edges, then writes a JSON
8
- manifest file.
9
-
10
- Can be run standalone via CLI or imported (use the ``collect()`` function).
11
-
12
- Substitution points
13
- -------------------
14
- - --log-file path to local HiveServer2 log (default: /tmp/root/hive.log)
15
-
16
- Prerequisites
17
- -------------
18
- pip install python-dotenv
19
-
20
- Usage
21
- -----
22
- python collect_lineage.py \\
23
- --log-file /tmp/root/hive.log \\
24
- --output-file lineage_output.json
25
- """
26
-
27
- from __future__ import annotations
28
-
29
- import argparse
30
- import json
31
- import re
32
- from dataclasses import dataclass, field
33
- from datetime import datetime, timezone
34
-
35
# ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
RESOURCE_TYPE = "data-lake"

# Regex for CTAS: CREATE TABLE [IF NOT EXISTS] db.table AS SELECT ... FROM db.table
# Captures the destination, the SELECT list, and the first db.table after FROM.
_CTAS_RE = re.compile(
    r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
    r"(?P<dest_db>\w+)\.(?P<dest_table>\w+)"
    r".*?AS\s+SELECT\s+(?P<select_cols>.+?)\s+FROM\s+(?P<src_db>\w+)\.(?P<src_table>\w+)",
    re.IGNORECASE | re.DOTALL,
)

# Regex for INSERT INTO/OVERWRITE db.table SELECT ... FROM db.table
# Same capture groups as _CTAS_RE so both can be handled by the same code.
_INSERT_RE = re.compile(
    r"INSERT\s+(?:INTO|OVERWRITE)\s+(?:TABLE\s+)?(?P<dest_db>\w+)\.(?P<dest_table>\w+)"
    r".*?SELECT\s+(?P<select_cols>.+?)\s+FROM\s+(?P<src_db>\w+)\.(?P<src_table>\w+)",
    re.IGNORECASE | re.DOTALL,
)

# Regex to detect additional JOIN sources beyond the primary FROM clause
_JOIN_RE = re.compile(r"JOIN\s+(?P<src_db>\w+)\.(?P<src_table>\w+)", re.IGNORECASE)

# Simple column alias extraction: [alias.]col [AS dest]
# Groups: 1 = optional qualifier, 2 = column token, 3 = optional AS alias.
_COL_RE = re.compile(r"(?:(\w+)\.)?(\w+)(?:\s+AS\s+(\w+))?", re.IGNORECASE)

# Hive string literals — strip before scanning so words inside 'status' AS ...
# are not treated as column refs ('' is an escaped quote inside a literal)
_STR_LITERAL_RE = re.compile(r"'(?:''|[^'])*'")

# ROW_NUMBER() OVER (...) AS alias — whole expression has no single source column;
# removing it avoids bogus tokens in col_mappings.
# The call parentheses may hold an integer so that NTILE(n), which always takes
# a bucket count, is stripped as well (an empty-parens-only pattern never
# matches it). The OVER (...) body must not contain nested parentheses.
_WINDOW_AS_ALIAS_RE = re.compile(
    r"\b(?:ROW_NUMBER|RANK|DENSE_RANK|NTILE)\s*\(\s*\d*\s*\)\s+OVER\s*\([^)]*\)\s+AS\s+\w+",
    re.IGNORECASE,
)

# Regex to pull query text out of Hive log "Executing/Starting command" lines.
# The query runs until the next line that starts with a YYYY-MM-DD timestamp,
# or to the end of the text.
_COMMAND_START_RE = re.compile(
    r"(?:Executing|Starting)\s+command\(queryId=\S*\):\s+(?P<query>.+?)(?=\n\d{4}-\d{2}-\d{2}|\Z)",
    re.DOTALL,
)

# Tokens that are almost never real column names — SQL keywords, functions, casts, etc.
# Entries are upper-case; callers upper-case a token before testing membership.
_SQL_SCAN_NOISE = frozenset(
    {
        "ROW_NUMBER", "RANK", "DENSE_RANK", "NTILE", "OVER", "PARTITION",
        "ORDER", "BY", "CASE", "WHEN", "THEN", "ELSE", "END", "AND", "OR",
        "NOT", "IN", "IS", "DISTINCT", "CAST", "CONVERT", "CURRENT_TIMESTAMP",
        "CURRENT_DATE", "TRUE", "FALSE", "NULL", "BETWEEN", "LIKE", "EXISTS",
        "ASC", "DESC", "LIMIT", "OFFSET", "GROUP", "HAVING", "UNION", "ALL",
        "INNER", "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "JOIN", "ON",
        "WHERE", "SELECT", "FROM", "AS", "STRING", "BIGINT", "INT", "SMALLINT",
        "TINYINT", "DOUBLE", "FLOAT", "REAL", "DECIMAL", "BOOLEAN", "DATE",
        "TIMESTAMP", "VARCHAR", "CHAR", "BINARY", "ARRAY", "MAP", "STRUCT",
        "SUM", "AVG", "COUNT", "MIN", "MAX", "STDDEV", "VARIANCE", "VAR_POP",
        "COALESCE", "IF", "SUBSTRING", "YEAR", "MONTH", "DAY", "LEAD", "LAG",
        "FIRST_VALUE", "LAST_VALUE",
    }
)
93
-
94
-
95
@dataclass
class _LineageEdge:
    """Lineage collected for a single destination table.

    One instance is kept per ``dest_db.dest_table``; sources and column
    mappings from every matching query are accumulated onto it.
    """

    # Database of the table being written.
    dest_db: str
    # Name of the table being written.
    dest_table: str
    # (database, table) pairs read by the queries that write the destination.
    sources: list[tuple[str, str]] = field(default_factory=list)
    # col_mappings: (dest_col, src_table, src_col)
    col_mappings: list[tuple[str, str, str]] = field(default_factory=list)
102
-
103
-
104
def _prepare_select_for_col_scan(select_clause: str) -> str:
    """Return the SELECT list with noise blanked out before column scanning.

    String literals are replaced by a space first, then every match of
    ``_WINDOW_AS_ALIAS_RE`` is replaced by a space, so that ``_COL_RE``
    produces fewer false positives.
    """
    without_literals = _STR_LITERAL_RE.sub(" ", select_clause)
    return _WINDOW_AS_ALIAS_RE.sub(" ", without_literals)
109
-
110
-
111
def _dedupe_col_mappings(mappings: list[tuple[str, str, str]]) -> list[tuple[str, str, str]]:
    """Return *mappings* without duplicates, keeping first-seen order."""
    # dict keys are unique and remember insertion order, so the first
    # occurrence of each tuple determines its position in the result.
    return list(dict.fromkeys(mappings))
120
-
121
-
122
def _extract_query_blocks(log_text: str) -> list[str]:
    """Return the SQL text of every command entry found in *log_text*.

    Each match of ``_COMMAND_START_RE`` contributes its ``query`` group with
    surrounding whitespace stripped, in the order the entries appear.
    """
    found = _COMMAND_START_RE.finditer(log_text)
    return [entry["query"].strip() for entry in found]
125
-
126
-
127
def _parse_select_cols(select_clause: str, src_table: str) -> list[tuple[str, str, str]]:
    """
    Lightweight column mapping: for each `alias.col AS dest` or `col AS dest`
    in the SELECT clause, return (dest_col, src_table, src_col).

    Strips string literals and window function headers first to reduce false
    positives, and filters out SQL keywords/noise tokens, numeric literals and
    function names.

    Args:
        select_clause: Text between SELECT and FROM.
        src_table: Table name recorded as the source of every mapping.

    Returns:
        De-duplicated (dest_col, src_table, src_col) tuples in first-seen order.
    """
    prepared = _prepare_select_for_col_scan(select_clause)
    call_open = re.compile(r"\s*\(")
    mappings: list[tuple[str, str, str]] = []
    for m in _COL_RE.finditer(prepared):
        src_col = m.group(2)
        alias = m.group(3)
        dest_col = alias or src_col
        # \w+ also matches digits, so literals such as the 10 and 2 in
        # DECIMAL(10,2), or `1 AS flag`, would otherwise become "columns".
        if src_col.isdigit():
            continue
        # The noise set already contains FROM/SELECT/WHERE/JOIN/ON/AS.
        if src_col.upper() in _SQL_SCAN_NOISE or dest_col.upper() in _SQL_SCAN_NOISE:
            continue
        # A bare token directly followed by "(" is a function call
        # (e.g. UPPER(name)), not a column reference.
        if alias is None and call_open.match(prepared, m.end()):
            continue
        # After stripping 'literal' AS col, we get " AS col" — skip bare (col, col) with no source expr.
        if dest_col == src_col:
            prefix = prepared[: m.start()].rstrip()
            if prefix.upper().endswith("AS"):
                continue
        mappings.append((dest_col, src_table, src_col))
    return _dedupe_col_mappings(mappings)
151
-
152
-
153
def _parse_edges(queries: list[str]) -> list[_LineageEdge]:
    """Turn SQL query strings into one ``_LineageEdge`` per destination table.

    A query contributes only if it matches the CTAS pattern or, failing that,
    the INSERT pattern. Queries writing the same ``db.table`` are merged into
    a single edge. Edges are returned in the order their destination was
    first seen.
    """
    by_dest: dict[str, _LineageEdge] = {}

    for raw_sql in queries:
        # Strip string literals to avoid false table/column matches inside
        # quoted strings, then collapse all whitespace runs to one space.
        flattened = _STR_LITERAL_RE.sub(" ", raw_sql)
        flattened = re.sub(r"\s+", " ", flattened).strip()

        # CTAS is tried first; a match object is always truthy, so `or`
        # falls through to the INSERT pattern only when CTAS did not match.
        hit = _CTAS_RE.search(flattened) or _INSERT_RE.search(flattened)
        if hit is None:
            continue

        dest_db = hit.group("dest_db").lower()
        dest_table = hit.group("dest_table").lower()
        primary_table = hit.group("src_table").lower()

        key = f"{dest_db}.{dest_table}"
        edge = by_dest.get(key)
        if edge is None:
            edge = _LineageEdge(dest_db=dest_db, dest_table=dest_table)
            by_dest[key] = edge

        # Primary FROM source first, then any additional JOIN sources.
        candidates = [(hit.group("src_db").lower(), primary_table)]
        candidates.extend(
            (joined.group("src_db").lower(), joined.group("src_table").lower())
            for joined in _JOIN_RE.finditer(flattened)
        )
        for pair in candidates:
            if pair not in edge.sources:
                edge.sources.append(pair)

        edge.col_mappings.extend(
            _parse_select_cols(hit.group("select_cols"), primary_table)
        )

    # Deduplicate column mappings per edge (same INSERT may appear many times in HS2 logs)
    for edge in by_dest.values():
        edge.col_mappings = _dedupe_col_mappings(edge.col_mappings)

    return list(by_dest.values())
195
-
196
-
197
def collect(log_file: str) -> dict:
    """
    Parse lineage edges from a HiveServer2 log file and return a manifest dict.

    Args:
        log_file: Path to a local HiveServer2 log file.

    Returns:
        Manifest dict with keys: resource_type, collected_at, edges.
        Each edge has destination, sources, and col_mappings lists.

    Raises:
        OSError: If the log file cannot be opened or read.
    """
    print(f"Reading Hive log file: {log_file} ...")
    # Decode explicitly as UTF-8 so results do not depend on the platform's
    # default encoding; undecodable bytes are replaced rather than raising.
    with open(log_file, encoding="utf-8", errors="replace") as fh:
        log_text = fh.read()

    queries = _extract_query_blocks(log_text)
    print(f" Extracted {len(queries)} query block(s).")

    edges = _parse_edges(queries)
    print(f" Parsed {len(edges)} lineage edge(s).")

    manifest = {
        "resource_type": RESOURCE_TYPE,
        # Timezone-aware UTC timestamp of this collection run.
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "edges": [
            {
                "destination": {"database": e.dest_db, "table": e.dest_table},
                "sources": [{"database": sdb, "table": stbl} for sdb, stbl in e.sources],
                "col_mappings": [
                    {"dest_col": dc, "src_table": st, "src_col": sc}
                    for dc, st, sc in e.col_mappings
                ],
            }
            for e in edges
        ],
    }
    return manifest
234
-
235
-
236
def main() -> None:
    """CLI entry point: parse arguments, collect lineage, write the manifest.

    Nothing is written when no lineage edges are found.
    """
    cli = argparse.ArgumentParser(
        description="Extract Hive lineage from a local log file and write a JSON manifest",
    )
    # ← SUBSTITUTE: your log path
    cli.add_argument(
        "--log-file",
        default="/tmp/root/hive.log",
        help="Path to local HiveServer2 log file (default: /tmp/root/hive.log)",
    )
    cli.add_argument(
        "--output-file",
        default="lineage_output.json",
        help="Path to write the lineage manifest (default: lineage_output.json)",
    )
    opts = cli.parse_args()

    manifest = collect(log_file=opts.log_file)

    if manifest["edges"]:
        with open(opts.output_file, "w") as out:
            json.dump(manifest, out, indent=2)
        print(f"Lineage manifest written to {opts.output_file}")
        print("Done.")
    else:
        print("No lineage edges detected — no CTAS or INSERT INTO ... SELECT patterns found.")
262
-
263
-
264
- if __name__ == "__main__":
265
- main()
@@ -1,313 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Collect table metadata from a Hive Metastore — collection only.
4
-
5
- Connects to HiveServer2 (default port 10000), discovers all databases and
6
- tables via SHOW DATABASES / SHOW TABLES, reads schema and table statistics
7
- via DESCRIBE FORMATTED, then writes a JSON manifest file.
8
-
9
- Can be run standalone via CLI or imported (use the ``collect()`` function).
10
-
11
- Substitution points
12
- -------------------
13
- - HIVE_HOST (env) / --hive-host (CLI) : HiveServer2 hostname
14
- - HIVE_PORT (env) / --hive-port (CLI) : HiveServer2 port (default 10000)
15
-
16
- Prerequisites
17
- -------------
18
- pip install pyhive python-dotenv
19
-
20
- Usage
21
- -----
22
- python collect_metadata.py \\
23
- --hive-host <HIVESERVER2_HOSTNAME> \\
24
- --output-file metadata_output.json
25
- """
26
-
27
- import argparse
28
- import json
29
- import os
30
- import re
31
- from datetime import datetime, timezone
32
-
33
- from pyhive import hive
34
-
35
-
36
def _check_available_memory(min_gb: float = 2.0) -> None:
    """Print a warning when available physical memory is below *min_gb*.

    The check is skipped silently when ``os.sysconf`` does not exist or when
    querying it raises ``ValueError`` or ``OSError``.
    """
    if not hasattr(os, "sysconf"):
        return  # Windows — skip check

    try:
        free_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_AVPHYS_PAGES")
    except (ValueError, OSError):
        return

    avail_gb = free_bytes / (1024 ** 3)
    if avail_gb < min_gb:
        print(
            f"WARNING: Only {avail_gb:.1f} GB of memory available "
            f"(minimum recommended: {min_gb:.1f} GB). "
            f"Consider reducing the number of databases/tables or increasing available memory."
        )
53
-
54
# ← SUBSTITUTE: set RESOURCE_TYPE to match your Monte Carlo connection type
RESOURCE_TYPE = "data-lake"

# Map Hive native types to SQL-standard uppercase types expected by Monte Carlo
_HIVE_TYPE_MAP: dict[str, str] = {
    "tinyint": "TINYINT",
    "smallint": "SMALLINT",
    "int": "INTEGER",
    "integer": "INTEGER",
    "bigint": "BIGINT",
    "float": "FLOAT",
    "double": "DOUBLE",
    "double precision": "DOUBLE",
    "decimal": "DECIMAL",
    "numeric": "DECIMAL",
    "boolean": "BOOLEAN",
    "string": "VARCHAR",
    "varchar": "VARCHAR",
    "char": "CHAR",
    "binary": "BINARY",
    "timestamp": "TIMESTAMP",
    "date": "DATE",
    "interval": "INTERVAL",
    "array": "ARRAY",
    "map": "MAP",
    "struct": "STRUCT",
    "uniontype": "UNION",
}

# ← SUBSTITUTE: add any internal table name prefixes you want to skip
_INTERNAL_TABLE_PREFIXES = ("tmp_", "__", "hive_")


def _normalize_hive_type(hive_type: str) -> str:
    """Uppercase and normalize a Hive type string to a SQL-standard form.

    Parametrized types like ``decimal(10,2)`` or ``varchar(255)`` keep their
    suffix; the base type is mapped through ``_HIVE_TYPE_MAP``. Base types
    missing from the map are returned upper-cased.

    Args:
        hive_type: Raw Hive type; surrounding whitespace is ignored.

    Returns:
        The mapped base type followed by the original parameter suffix.
    """
    # Strip once, up front: the suffix must be cut from the same text the
    # base was measured on, otherwise leading whitespace shifts the slice
    # and leaks part of the base name into the suffix.
    cleaned = hive_type.strip()
    head, paren, params = cleaned.partition("(")
    base = head.strip().lower()
    suffix = (paren + params).strip()  # preserve original params, e.g. decimal(10,2)
    return _HIVE_TYPE_MAP.get(base, base.upper()) + suffix
97
-
98
-
99
def _connect(
    host: str,
    port: int,
    username: str = "hadoop",
    auth: str = "NONE",
) -> hive.Connection:
    """Open a HiveServer2 connection via ``pyhive.hive.connect``.

    Args:
        host: HiveServer2 hostname.
        port: HiveServer2 port.
        username: User to connect as; defaults to the previously hard-coded
            ``"hadoop"``.
        auth: Authentication mode passed to PyHive; defaults to the previously
            hard-coded ``"NONE"``.

    Returns:
        The connection object returned by ``hive.connect``.
    """
    # ← SUBSTITUTE: update username/auth if your cluster requires Kerberos or LDAP
    return hive.connect(host=host, port=port, username=username, auth=auth)
102
-
103
-
104
def _fetch_rows(cursor, query: str) -> list[tuple]:
    """Run *query* on *cursor* and return every result row as one list.

    Rows are pulled from the cursor 1000 at a time until an empty batch
    comes back; the complete result is still accumulated in memory.
    """
    cursor.execute(query)
    collected: list[tuple] = []
    batch = cursor.fetchmany(1000)
    while batch:
        collected.extend(batch)
        batch = cursor.fetchmany(1000)
    return collected
114
-
115
-
116
def _parse_describe_formatted(rows: list[tuple]) -> dict:
    """
    Parse DESCRIBE FORMATTED <db>.<table> output into a structured dict.

    Args:
        rows: Result rows; the first three values of each row are read as
            (col_name, data_type, comment). ``None`` values count as empty.

    Returns:
        Dict with keys: columns, row_count, total_size, last_modified,
        description, created_on. Values that are never found stay ``None``
        (``columns`` stays an empty list).
    """
    parsed: dict = {
        "columns": [],
        "row_count": None,
        "total_size": None,
        "last_modified": None,
        "description": None,
        "created_on": None,
    }
    # Current section of the output: "columns", "table", or None.
    section = None

    for row in rows:
        first = (row[0] or "").strip()
        second = (row[1] or "").strip()
        third = (row[2] or "").strip() if len(row) > 2 else ""

        # Header rows only switch the section; they carry no data themselves.
        if first.startswith("#"):
            if first.startswith("# col_name"):
                section = "columns"
            elif first.startswith("# Detailed Table Information"):
                section = "table"
            elif section == "columns":
                # Any other header ends a column listing, but does not end
                # the table-information section.
                section = None
            continue

        if section == "columns":
            if first and second:
                parsed["columns"].append(
                    {
                        "name": first,
                        "type": _normalize_hive_type(second),
                        "description": third or None,
                    }
                )
            continue

        if section != "table":
            continue

        # Table Parameters rows have an empty col_name; key is in data_type, value in comment
        if first:
            key, value = first.rstrip(":"), second
        else:
            key, value = second, third

        # A value that fails to convert is ignored and leaves any earlier
        # value for that field in place.
        try:
            if re.search(r"numRows", key, re.IGNORECASE):
                parsed["row_count"] = int(value)
            elif re.search(r"totalSize", key, re.IGNORECASE):
                parsed["total_size"] = int(value)
            elif re.search(r"last_modified_time", key, re.IGNORECASE):
                parsed["last_modified"] = datetime.fromtimestamp(
                    int(value), tz=timezone.utc
                ).isoformat()
            elif re.search(r"^CreateTime", key):
                # e.g. "Wed Mar 18 20:15:40 UTC 2026"
                created = datetime.strptime(value, "%a %b %d %H:%M:%S %Z %Y")
                parsed["created_on"] = created.replace(tzinfo=timezone.utc).isoformat()
            elif key == "comment" and not parsed["description"] and value:
                parsed["description"] = value
        except (ValueError, TypeError):
            pass

    return parsed
192
-
193
-
194
def collect(
    hive_host: str,
    hive_port: int = 10000,
) -> dict:
    """
    Connect to HiveServer2, discover all databases and tables, and return a
    manifest dict with collected asset metadata.

    Tables whose DESCRIBE FORMATTED query fails are reported and skipped.
    The cursor and connection are closed even when collection fails part-way.

    Args:
        hive_host: HiveServer2 hostname.
        hive_port: HiveServer2 port (default 10000).

    Returns:
        Manifest dict with keys: resource_type, collected_at, assets.
    """
    _check_available_memory()
    print(f"Connecting to HiveServer2 at {hive_host}:{hive_port} ...")
    conn = _connect(hive_host, hive_port)
    assets: list[dict] = []

    # try/finally so a failing SHOW DATABASES / SHOW TABLES does not leave
    # the cursor or the connection open.
    try:
        cursor = conn.cursor()
        try:
            print("Collecting table metadata ...")
            databases = [row[0] for row in _fetch_rows(cursor, "SHOW DATABASES")]
            print(f" Found databases: {databases}")

            for db in databases:
                # ← SUBSTITUTE: add any system databases you want to skip
                if db in ("information_schema",):
                    continue

                tables = _fetch_rows(cursor, f"SHOW TABLES IN {db}")
                table_names = [row[0] for row in tables]
                print(f" {db}: {len(table_names)} table(s)")

                for table in table_names:
                    # str.startswith accepts the whole tuple of prefixes.
                    if table.startswith(_INTERNAL_TABLE_PREFIXES):
                        continue

                    # Best effort: one table that cannot be described must
                    # not abort the whole collection run.
                    try:
                        desc_rows = _fetch_rows(cursor, f"DESCRIBE FORMATTED {db}.{table}")
                    except Exception as exc:
                        print(f" WARNING: could not describe {db}.{table}: {exc}")
                        continue

                    info = _parse_describe_formatted(desc_rows)

                    # Missing, zero and negative statistics are all reported as None.
                    row_count = info["row_count"] if info["row_count"] and info["row_count"] > 0 else None
                    byte_count = info["total_size"] if info["total_size"] and info["total_size"] > 0 else None

                    assets.append(
                        {
                            "database": db,
                            "schema": db,
                            "name": table,
                            "description": info["description"],
                            "created_on": info["created_on"],
                            "row_count": row_count,
                            "byte_count": byte_count,
                            "last_modified": info["last_modified"],
                            "fields": [
                                {"name": col["name"], "type": col["type"], "description": col["description"]}
                                for col in info["columns"]
                            ],
                        }
                    )
                    print(
                        f" + {db}.{table} ({len(info['columns'])} columns, "
                        f"desc={info['description']!r}, created={info['created_on']})"
                    )
        finally:
            cursor.close()
    finally:
        conn.close()

    print(f"\nCollected {len(assets)} table(s).")

    manifest = {
        "resource_type": RESOURCE_TYPE,
        "collected_at": datetime.now(tz=timezone.utc).isoformat(),
        "assets": assets,
    }
    return manifest
274
-
275
-
276
def main() -> None:
    """CLI entry point: parse arguments, collect metadata, write the manifest.

    The host comes from ``--hive-host`` or ``HIVE_HOST`` and the port from
    ``--hive-port`` or ``HIVE_PORT``; command-line values win over the
    environment. Exits with a usage error when no host is given.
    """
    parser = argparse.ArgumentParser(
        description="Collect Hive table metadata and write a JSON manifest",
    )
    parser.add_argument(
        "--hive-host",
        default=os.environ.get("HIVE_HOST"),
        help="HiveServer2 hostname (env: HIVE_HOST)",  # ← SUBSTITUTE: your EMR master DNS or Hive host
    )
    parser.add_argument(
        "--hive-port",
        type=int,
        # Honour the documented HIVE_PORT variable. argparse runs string
        # defaults through `type`, so a bad value yields a normal usage
        # error; `or` also covers a variable that is set but empty.
        default=os.environ.get("HIVE_PORT") or "10000",
        help="HiveServer2 port (env: HIVE_PORT, default: 10000)",  # ← SUBSTITUTE if your cluster uses a non-standard port
    )
    parser.add_argument(
        "--output-file",
        default="metadata_output.json",
        help="Path to write the output manifest (default: metadata_output.json)",
    )
    args = parser.parse_args()

    if not args.hive_host:
        parser.error("--hive-host is required (or set HIVE_HOST)")

    manifest = collect(
        hive_host=args.hive_host,
        hive_port=args.hive_port,
    )

    with open(args.output_file, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"Asset manifest written to {args.output_file}")
    print("Done.")
310
-
311
-
312
- if __name__ == "__main__":
313
- main()