nuvu-scan 1.3.7__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.github/workflows/ci.yml +32 -17
  2. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.github/workflows/release.yml +19 -19
  3. nuvu_scan-2.0.0/.pre-commit-config.yaml +42 -0
  4. nuvu_scan-2.0.0/DEVELOPMENT_STATUS.md +359 -0
  5. nuvu_scan-2.0.0/Makefile +39 -0
  6. nuvu_scan-1.3.7/README.md → nuvu_scan-2.0.0/PKG-INFO +115 -11
  7. nuvu_scan-1.3.7/PKG-INFO → nuvu_scan-2.0.0/README.md +68 -49
  8. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/RELEASE.md +2 -2
  9. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/__init__.py +1 -1
  10. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/commands/scan.py +94 -0
  11. nuvu_scan-2.0.0/nuvu_scan/cli/formatters/html.py +421 -0
  12. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/main.py +2 -1
  13. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/base.py +6 -0
  14. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/aws_scanner.py +55 -15
  15. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/athena.py +3 -0
  16. nuvu_scan-2.0.0/nuvu_scan/core/providers/aws/collectors/glue.py +553 -0
  17. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/iam.py +9 -0
  18. nuvu_scan-2.0.0/nuvu_scan/core/providers/aws/collectors/redshift.py +910 -0
  19. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/gcp_scanner.py +42 -10
  20. nuvu_scan-2.0.0/pyproject.toml +138 -0
  21. nuvu_scan-1.3.7/DEVELOPMENT_STATUS.md +0 -249
  22. nuvu_scan-1.3.7/nuvu_scan/cli/formatters/html.py +0 -169
  23. nuvu_scan-1.3.7/nuvu_scan/core/providers/aws/collectors/glue.py +0 -148
  24. nuvu_scan-1.3.7/nuvu_scan/core/providers/aws/collectors/redshift.py +0 -232
  25. nuvu_scan-1.3.7/pyproject.toml +0 -90
  26. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  27. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.gitignore +0 -0
  28. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/CONTRIBUTING.md +0 -0
  29. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/__init__.py +0 -0
  30. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/commands/__init__.py +0 -0
  31. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/formatters/__init__.py +0 -0
  32. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/formatters/csv.py +0 -0
  33. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/formatters/json.py +0 -0
  34. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/__init__.py +0 -0
  35. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/analyzers/__init__.py +0 -0
  36. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/models/__init__.py +0 -0
  37. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/__init__.py +0 -0
  38. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/__init__.py +0 -0
  39. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/__init__.py +0 -0
  40. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/cost_explorer.py +0 -0
  41. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/mwaa.py +0 -0
  42. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/s3.py +0 -0
  43. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/__init__.py +0 -0
  44. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/__init__.py +0 -0
  45. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/bigquery.py +0 -0
  46. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/billing.py +0 -0
  47. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/dataproc.py +0 -0
  48. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/gcs.py +0 -0
  49. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/gemini.py +0 -0
  50. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/iam.py +0 -0
  51. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/pubsub.py +0 -0
  52. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/tests/__init__.py +0 -0
  53. {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/tests/test_base.py +0 -0
@@ -7,43 +7,58 @@ on:
7
7
  branches: [main, develop]
8
8
 
9
9
  jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - name: Install uv
16
+ uses: astral-sh/setup-uv@v4
17
+ with:
18
+ version: "latest"
19
+
20
+ - name: Set up Python
21
+ run: uv python install 3.12
22
+
23
+ - name: Install dependencies
24
+ run: uv sync --dev
25
+
26
+ - name: Run pre-commit
27
+ run: uv run pre-commit run --all-files
28
+
10
29
  test:
11
30
  runs-on: ubuntu-latest
31
+ needs: lint
12
32
  strategy:
13
33
  matrix:
14
34
  python-version: ["3.10", "3.11", "3.12", "3.13"]
15
-
35
+
16
36
  steps:
17
37
  - uses: actions/checkout@v4
18
-
38
+
19
39
  - name: Install uv
20
40
  uses: astral-sh/setup-uv@v4
21
41
  with:
22
42
  version: "latest"
23
-
43
+
24
44
  - name: Set up Python ${{ matrix.python-version }}
25
45
  run: uv python install ${{ matrix.python-version }}
26
-
46
+
27
47
  - name: Install dependencies
28
48
  run: |
29
49
  uv sync --dev
30
-
31
- - name: Run linter
32
- run: |
33
- uv run ruff check .
34
- uv run black --check .
35
-
50
+
36
51
  - name: Run type checker
37
52
  run: |
38
53
  uv run mypy nuvu_scan || true # Allow failures for now
39
-
54
+
40
55
  - name: Run tests
41
56
  run: |
42
57
  uv run pytest --cov=nuvu_scan --cov-report=xml
43
58
  env:
44
59
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
45
60
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
46
-
61
+
47
62
  - name: Upload coverage
48
63
  uses: codecov/codecov-action@v3
49
64
  with:
@@ -54,22 +69,22 @@ jobs:
54
69
  runs-on: ubuntu-latest
55
70
  needs: test
56
71
  if: github.event_name == 'push' && github.ref == 'refs/heads/main'
57
-
72
+
58
73
  steps:
59
74
  - uses: actions/checkout@v4
60
-
75
+
61
76
  - name: Install uv
62
77
  uses: astral-sh/setup-uv@v4
63
78
  with:
64
79
  version: "latest"
65
-
80
+
66
81
  - name: Set up Python
67
82
  run: uv python install 3.11
68
-
83
+
69
84
  - name: Build package
70
85
  run: |
71
86
  uv build
72
-
87
+
73
88
  - name: Upload artifacts
74
89
  uses: actions/upload-artifact@v4
75
90
  with:
@@ -18,21 +18,21 @@ jobs:
18
18
  permissions:
19
19
  contents: write # Required to create tags and releases
20
20
  id-token: write # Required for trusted publishing
21
-
21
+
22
22
  steps:
23
23
  - name: Checkout code
24
24
  uses: actions/checkout@v4
25
25
  with:
26
26
  fetch-depth: 0 # Fetch full history for version comparison
27
-
27
+
28
28
  - name: Install uv
29
29
  uses: astral-sh/setup-uv@v4
30
30
  with:
31
31
  version: "latest"
32
-
32
+
33
33
  - name: Set up Python
34
34
  run: uv python install 3.12
35
-
35
+
36
36
  - name: Extract version from pyproject.toml
37
37
  id: get_version
38
38
  run: |
@@ -41,18 +41,18 @@ jobs:
41
41
  VERSION=$(grep -E '^\s*version\s*=' pyproject.toml | head -1 | sed -E "s/.*version\s*=\s*['\"]([^'\"]+)['\"].*/\1/" | tr -d ' ')
42
42
  echo "version=$VERSION" >> $GITHUB_OUTPUT
43
43
  echo "Current version: $VERSION"
44
-
44
+
45
45
  - name: Check if version was bumped
46
46
  id: check_version
47
47
  if: github.event_name == 'push'
48
48
  run: |
49
49
  CURRENT_VERSION="${{ steps.get_version.outputs.version }}"
50
-
50
+
51
51
  # Get the previous commit's version
52
52
  git checkout HEAD~1 pyproject.toml 2>/dev/null || echo "No previous commit found"
53
53
  PREVIOUS_VERSION=$(grep -E '^\s*version\s*=' pyproject.toml 2>/dev/null | head -1 | sed -E "s/.*version\s*=\s*['\"]([^'\"]+)['\"].*/\1/" | tr -d ' ' || echo "")
54
54
  git checkout HEAD pyproject.toml
55
-
55
+
56
56
  if [ -z "$PREVIOUS_VERSION" ]; then
57
57
  echo "No previous version found, assuming first release"
58
58
  echo "should_release=true" >> $GITHUB_OUTPUT
@@ -63,7 +63,7 @@ jobs:
63
63
  echo "Version unchanged ($CURRENT_VERSION), skipping release"
64
64
  echo "should_release=false" >> $GITHUB_OUTPUT
65
65
  fi
66
-
66
+
67
67
  - name: Check if tag already exists
68
68
  id: check_tag
69
69
  if: |
@@ -72,7 +72,7 @@ jobs:
72
72
  run: |
73
73
  VERSION="${{ steps.get_version.outputs.version }}"
74
74
  TAG="v${VERSION}"
75
-
75
+
76
76
  if git rev-parse "$TAG" >/dev/null 2>&1; then
77
77
  echo "Tag $TAG already exists, skipping release"
78
78
  echo "tag_exists=true" >> $GITHUB_OUTPUT
@@ -80,7 +80,7 @@ jobs:
80
80
  echo "Tag $TAG does not exist, will create release"
81
81
  echo "tag_exists=false" >> $GITHUB_OUTPUT
82
82
  fi
83
-
83
+
84
84
  - name: Create Git Tag
85
85
  if: |
86
86
  ((github.event_name == 'push' && steps.check_version.outputs.should_release == 'true' && steps.check_tag.outputs.tag_exists == 'false') ||
@@ -88,14 +88,14 @@ jobs:
88
88
  run: |
89
89
  VERSION="${{ steps.get_version.outputs.version }}"
90
90
  TAG="v${VERSION}"
91
-
91
+
92
92
  git config user.name "github-actions[bot]"
93
93
  git config user.email "github-actions[bot]@users.noreply.github.com"
94
-
94
+
95
95
  git tag -a "$TAG" -m "Release $TAG"
96
96
  git push origin "$TAG"
97
97
  echo "Created and pushed tag: $TAG"
98
-
98
+
99
99
  - name: Create GitHub Release
100
100
  if: |
101
101
  ((github.event_name == 'push' && steps.check_version.outputs.should_release == 'true' && steps.check_tag.outputs.tag_exists == 'false') ||
@@ -106,12 +106,12 @@ jobs:
106
106
  name: Release v${{ steps.get_version.outputs.version }}
107
107
  body: |
108
108
  ## Release v${{ steps.get_version.outputs.version }}
109
-
109
+
110
110
  Automated release created from merged PR.
111
-
111
+
112
112
  ### Changes
113
113
  See the merged PR for detailed changelog.
114
-
114
+
115
115
  ### Installation
116
116
  ```bash
117
117
  pip install --upgrade nuvu-scan==${{ steps.get_version.outputs.version }}
@@ -120,14 +120,14 @@ jobs:
120
120
  prerelease: false
121
121
  env:
122
122
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
123
-
123
+
124
124
  - name: Build package
125
125
  if: |
126
126
  ((github.event_name == 'push' && steps.check_version.outputs.should_release == 'true' && steps.check_tag.outputs.tag_exists == 'false') ||
127
127
  (github.event_name == 'workflow_dispatch' && steps.check_tag.outputs.tag_exists == 'false'))
128
128
  run: |
129
129
  uv build
130
-
130
+
131
131
  - name: Publish to PyPI
132
132
  if: |
133
133
  ((github.event_name == 'push' && steps.check_version.outputs.should_release == 'true' && steps.check_tag.outputs.tag_exists == 'false') ||
@@ -136,7 +136,7 @@ jobs:
136
136
  with:
137
137
  packages-dir: dist/
138
138
  print-hash: true
139
-
139
+
140
140
  - name: Summary
141
141
  if: always()
142
142
  run: |
@@ -0,0 +1,42 @@
1
+ # Pre-commit hooks for nuvu-scan
2
+ # Install: uv run pre-commit install
3
+ # Run manually: uv run pre-commit run --all-files
4
+
5
+ repos:
6
+ # Ruff - Fast Python linter and formatter
7
+ - repo: https://github.com/astral-sh/ruff-pre-commit
8
+ rev: v0.8.6
9
+ hooks:
10
+ # Run the linter with auto-fix
11
+ - id: ruff
12
+ args: [--fix, --exit-non-zero-on-fix]
13
+ # Run the formatter
14
+ - id: ruff-format
15
+
16
+ # Pre-commit hooks for general file hygiene
17
+ - repo: https://github.com/pre-commit/pre-commit-hooks
18
+ rev: v5.0.0
19
+ hooks:
20
+ - id: trailing-whitespace
21
+ - id: end-of-file-fixer
22
+ - id: check-yaml
23
+ - id: check-json
24
+ - id: check-added-large-files
25
+ args: ['--maxkb=1000']
26
+ - id: check-merge-conflict
27
+ - id: detect-private-key
28
+
29
+ # Check for common security issues
30
+ - repo: https://github.com/PyCQA/bandit
31
+ rev: 1.8.2
32
+ hooks:
33
+ - id: bandit
34
+ args: ["-c", "pyproject.toml"]
35
+ additional_dependencies: ["bandit[toml]"]
36
+
37
+ # Configuration
38
+ ci:
39
+ autofix_commit_msg: |
40
+ [pre-commit.ci] auto fixes from pre-commit hooks
41
+ autoupdate_commit_msg: |
42
+ [pre-commit.ci] pre-commit autoupdate
@@ -0,0 +1,359 @@
1
+ # Nuvu Scan - Development Status
2
+
3
+ **Multi-Cloud Data Asset Control** - Designed from the ground up to support AWS, GCP, Azure, and Databricks.
4
+
5
+ ## ✅ Completed (v2.0.0)
6
+
7
+ ### Core Architecture
8
+ - ✅ Cloud-agnostic base interface (`CloudProviderScan`)
9
+ - ✅ Normalized asset categories enum (now includes `DATA_PIPELINE`, `DATA_SHARING`)
10
+ - ✅ Cloud-agnostic data models (`Asset`, `ScanResult`, `ScanConfig`)
11
+ - ✅ Provider module structure for future multi-cloud support
12
+ - ✅ Modern Python packaging with `uv` and `pyproject.toml`
13
+ - ✅ Python 3.10+ support (removed EOL versions 3.8, 3.9)
14
+
15
+ ### AWS Provider Implementation
16
+
17
+ #### S3 Bucket Collector
18
+ - ✅ Lists all buckets across all regions
19
+ - ✅ Gets bucket metadata (size, storage class, tags)
20
+ - ✅ Detects public access and policy status
21
+ - ✅ Estimates costs (storage + requests)
22
+ - ✅ Flags risks (empty buckets, PII naming, public access)
23
+ - ✅ Infers ownership from tags
24
+ - ✅ Last activity tracking via CloudTrail
25
+
26
+ #### Glue Data Catalog Collector (Enhanced in v2.0.0)
27
+ - ✅ **Databases & Tables**
28
+ - Lists databases and tables
29
+ - Detects empty tables and databases
30
+ - Links databases to their crawlers for activity tracking
31
+ - Table update time tracking
32
+ - External table (Spectrum) detection
33
+ - ✅ **Glue Crawlers** (NEW)
34
+ - Lists all crawlers with status (READY, RUNNING)
35
+ - Schedule expression and state (SCHEDULED, unscheduled)
36
+ - Last crawl time and status
37
+ - Tables created/updated/deleted counts
38
+ - Risk flags: `stale_crawler` (>90 days), `no_schedule`, `never_run`
39
+ - ✅ **Glue ETL Jobs** (NEW)
40
+ - Lists all ETL jobs
41
+ - Job type, Glue version, allocated capacity
42
+ - Last run status and time
43
+ - Cost estimation based on DPU hours
44
+ - Risk flags: `stale_job`, `never_run`, `failed_job`
45
+ - ✅ **Glue Connections** (NEW)
46
+ - Lists JDBC connections
47
+ - Connection type and masked JDBC URLs
48
+ - Risk flags: `external_connection` (non-AWS databases)
49
+
50
+ #### Athena Workgroup Collector
51
+ - ✅ Lists workgroups
52
+ - ✅ Analyzes query history (last 90 days)
53
+ - ✅ Detects idle workgroups
54
+ - ✅ Flags high failure rates
55
+ - ✅ Last activity tracking from query stats
56
+
57
+ #### Redshift Collector (Major Enhancement in v2.0.0)
58
+ - ✅ **Provisioned Clusters** (Enhanced)
59
+ - Lists all clusters with detailed metrics
60
+ - Node type, count, encryption status
61
+ - CloudWatch-based activity tracking (DatabaseConnections, CPUUtilization)
62
+ - Cluster age calculation
63
+ - VPC and public accessibility detection
64
+ - **Reservation coverage analysis** - checks if covered by reserved nodes
65
+ - **WLM configuration analysis** - queue count, auto WLM, unlimited queues
66
+ - Potential reservation savings calculation (40% estimate)
67
+ - Risk flags: `publicly_accessible`, `unencrypted`, `low_activity`, `potentially_unused`, `no_reservation_long_running`, `default_wlm_only`, `unlimited_wlm_queue`
68
+ - ✅ **Redshift Serverless**
69
+ - Namespaces with encryption status
70
+ - Workgroups with base capacity and cost estimation
71
+ - Risk flags: `publicly_accessible`
72
+ - ✅ **Redshift Datashares** (NEW)
73
+ - Lists all datashares (inbound and outbound)
74
+ - Consumer account identification
75
+ - Cross-account and cross-region detection
76
+ - Public consumer allowance check
77
+ - Risk flags: `cross_account_sharing`, `cross_region_sharing`, `allows_public_consumers`
78
+ - ✅ **Redshift Snapshots** (NEW)
79
+ - Lists all snapshots (manual and automated)
80
+ - Snapshot size and storage cost estimation
81
+ - Snapshot age tracking
82
+ - Orphan snapshot detection (source cluster deleted)
83
+ - Risk flags: `old_snapshot` (>90 days), `very_old_snapshot` (>365 days), `large_snapshot` (>1TB), `orphan_snapshot`
84
+ - ✅ **Redshift Reserved Nodes** (NEW)
85
+ - Lists all reserved nodes (active and retired)
86
+ - Node type, count, offering type
87
+ - Remaining duration calculation
88
+ - Expiration tracking
89
+ - Annual and monthly cost calculation
90
+ - Risk flags: `reservation_expired`, `reservation_expiring_soon`, `reservation_retired`
91
+
92
+ #### IAM Roles Collector
93
+ - ✅ Lists IAM roles with data-access permissions
94
+ - ✅ Detects unused roles (90+ days)
95
+ - ✅ Flags overly permissive policies
96
+ - ✅ Infers ownership from tags and role names
97
+ - ✅ Last activity tracking from `RoleLastUsed`
98
+
99
+ #### MWAA (Managed Workflows for Apache Airflow) Collector
100
+ - ✅ Lists MWAA environments across regions
101
+ - ✅ Collects environment details (status, version, worker counts)
102
+ - ✅ Estimates costs based on environment class
103
+ - ✅ Infers ownership from tags
104
+ - ✅ Last activity tracking from `LastUpdate`
105
+
106
+ #### Cost Explorer Integration
107
+ - ✅ Retrieves actual costs from AWS Cost Explorer API
108
+ - ✅ Service-level cost breakdown
109
+ - ✅ Monthly cost estimates based on last 30 days
110
+ - ✅ Cost summary asset in scan results
111
+
112
+ ### GCP Provider Implementation
113
+
114
+ #### GCS (Google Cloud Storage) Collector
115
+ - ✅ Lists all buckets
116
+ - ✅ Gets bucket metadata (size, storage class, labels)
117
+ - ✅ Detects public access
118
+ - ✅ Estimates costs
119
+ - ✅ Flags risks (empty buckets, public access)
120
+ - ✅ Infers ownership from labels
121
+ - ✅ Last activity tracking from bucket update time
122
+
123
+ #### BigQuery Collector
124
+ - ✅ Lists datasets and tables
125
+ - ✅ Analyzes query job history (last 90 days)
126
+ - ✅ Tracks query costs (including public datasets)
127
+ - ✅ Creates dedicated asset for query costs
128
+ - ✅ Estimates costs with 1 TB free tier consideration
129
+ - ✅ Detailed usage metrics (TB processed, monthly estimates)
130
+ - ✅ Last activity tracking from query stats
131
+
132
+ #### Dataproc Collector
133
+ - ✅ Lists Dataproc clusters
134
+ - ✅ Collects cluster details and job history
135
+ - ✅ Estimates costs
136
+ - ✅ Last activity tracking from job stats
137
+
138
+ #### Pub/Sub Collector
139
+ - ✅ Lists topics and subscriptions
140
+ - ✅ Collects topic metadata
141
+ - ✅ Estimates costs
142
+ - ✅ Last activity tracking
143
+
144
+ #### IAM Service Accounts Collector
145
+ - ✅ Lists service accounts
146
+ - ✅ Checks for data-access roles
147
+ - ✅ Flags overly permissive roles
148
+ - ✅ Infers ownership from display names and email patterns
149
+ - ✅ Last activity tracking from update time
150
+
151
+ #### Gemini API Collector
152
+ - ✅ Checks if Gemini API is enabled
153
+ - ✅ Retrieves actual costs from BigQuery billing export
154
+ - ✅ Fallback to Cloud Monitoring API for usage detection
155
+ - ✅ Last activity tracking from billing data
156
+
157
+ ### CLI
158
+ - ✅ Command-line interface with `nuvu scan --provider <aws|gcp>`
159
+ - ✅ Support for multiple output formats:
160
+ - HTML (default) - Beautiful interactive report with governance insights
161
+ - JSON - Machine-readable format
162
+ - CSV - Spreadsheet-friendly format
163
+ - ✅ Credential handling:
164
+ - AWS: env vars, CLI args, AWS profiles, IAM role assumption
165
+ - GCP: JSON key files, `GOOGLE_APPLICATION_CREDENTIALS`, JSON content
166
+ - ✅ Region filtering support (AWS)
167
+ - ✅ Project ID support (GCP)
168
+ - ✅ **Nuvu Cloud API push** (`--push --api-key`)
169
+ - ✅ **Collector Filtering** (NEW)
170
+ - `--collectors` / `-c` option to run specific collectors
171
+ - `--list-collectors` to show available collectors
172
+ - AWS collectors: `s3`, `glue`, `athena`, `redshift`, `iam`, `mwaa`
173
+ - GCP collectors: `gcs`, `bigquery`, `dataproc`, `pubsub`, `iam`, `gemini`
174
+ - Omit option for full scan (all collectors)
175
+ - ✅ **Progress Logging** - Real-time status updates during collection
176
+
177
+ ### Enhanced HTML Reports (v2.0.0)
178
+ - ✅ **Executive Summary** with key metrics
179
+ - ✅ **Cost Optimization Section**
180
+ - Snapshot cost analysis with old snapshot flagging
181
+ - Reserved node status and expiration tracking
182
+ - Potential savings calculation
183
+ - ✅ **Governance Insights Section**
184
+ - Stale/unused crawlers and ETL jobs
185
+ - Cross-account data sharing alerts
186
+ - WLM configuration review
187
+ - ✅ Improved styling with insight boxes (warning, alert, info)
188
+ - ✅ Potential savings card in summary
189
+
190
+ ### New Asset Categories (v2.0.0)
191
+ - ✅ `DATA_PIPELINE` - ETL jobs, crawlers, workflows
192
+ - ✅ `DATA_SHARING` - Datashares, cross-account sharing
193
+
194
+ ### New Asset Types (v2.0.0)
195
+ | Asset Type | Service | Description |
196
+ |------------|---------|-------------|
197
+ | `glue_crawler` | Glue | Crawler status, schedule, last run |
198
+ | `glue_job` | Glue | ETL job status, DPU allocation |
199
+ | `glue_connection` | Glue | JDBC connections to external DBs |
200
+ | `redshift_datashare` | Redshift | Cross-account data sharing |
201
+ | `redshift_snapshot` | Redshift | Manual and automated snapshots |
202
+ | `redshift_reserved_node` | Redshift | Reserved capacity purchases |
203
+ | `redshift_serverless_workgroup` | Redshift | Serverless workgroup details |
204
+
205
+ ### New Risk Flags (v2.0.0)
206
+ | Category | Flag | Description |
207
+ |----------|------|-------------|
208
+ | Glue | `stale_crawler` | Crawler hasn't run in 90+ days |
209
+ | Glue | `no_schedule` | Crawler has no schedule configured |
210
+ | Glue | `never_run` | Crawler or job has never been executed |
211
+ | Glue | `stale_job` | ETL job hasn't run in 90+ days |
212
+ | Glue | `failed_job` | Last job run failed |
213
+ | Glue | `external_connection` | JDBC connection to non-AWS database |
214
+ | Redshift | `cross_account_sharing` | Datashare shared to another AWS account |
215
+ | Redshift | `cross_region_sharing` | Datashare shared across regions |
216
+ | Redshift | `allows_public_consumers` | Datashare allows public consumers |
217
+ | Redshift | `old_snapshot` | Snapshot older than 90 days |
218
+ | Redshift | `very_old_snapshot` | Snapshot older than 365 days |
219
+ | Redshift | `large_snapshot` | Snapshot larger than 1TB |
220
+ | Redshift | `orphan_snapshot` | Source cluster no longer exists |
221
+ | Redshift | `no_reservation_long_running` | Cluster running 90+ days without reservation |
222
+ | Redshift | `reservation_expired` | Reserved node has expired |
223
+ | Redshift | `reservation_expiring_soon` | Reserved node expires within 30 days |
224
+ | Redshift | `default_wlm_only` | Cluster using only default WLM queue |
225
+ | Redshift | `unlimited_wlm_queue` | WLM queue with no concurrency limit |
226
+
227
+ ### Cost Tracking & Reporting
228
+ - ✅ Asset-level cost estimation for all resources
229
+ - ✅ AWS Cost Explorer API integration for actual costs
230
+ - ✅ GCP Cloud Billing API integration (Gemini costs)
231
+ - ✅ BigQuery query cost tracking (including public datasets)
232
+ - ✅ Redshift snapshot storage cost estimation
233
+ - ✅ Potential reservation savings calculation
234
+ - ✅ Cost summary assets showing service-level breakdowns
235
+
236
+ ### Usage & Activity Tracking
237
+ - ✅ Last activity timestamp for all assets (`last_activity_at`)
238
+ - ✅ Days since last use calculation
239
+ - ✅ **CloudWatch metrics for Redshift** (DatabaseConnections, CPUUtilization)
240
+ - ✅ CloudTrail integration for AWS (S3, Redshift)
241
+ - ✅ Crawler run times for Glue database/table activity
242
+ - ✅ Query history analysis (Athena, BigQuery)
243
+ - ✅ Job history analysis (Dataproc, Glue ETL)
244
+
245
+ ### Package & Distribution
246
+ - ✅ Modern Python packaging with `pyproject.toml` and `uv`
247
+ - ✅ Comprehensive README.md with setup instructions
248
+ - ✅ IAM policy file (`aws-iam-policy.json`) with 60+ read-only actions
249
+ - ✅ GitHub Actions CI/CD workflows
250
+ - ✅ Package structure ready for PyPI
251
+
252
+ ## 🧪 Tested
253
+
254
+ ### AWS (v2.0.0 Test Results)
255
+ - ✅ Discovered 2,344 assets in single-region scan (us-west-2)
256
+ - 90 S3 buckets
257
+ - 1,013 Glue assets (94 databases, 904 tables, 10 crawlers, 2 jobs, 3 connections)
258
+ - 1 Athena workgroup
259
+ - 1,141 Redshift assets (5 clusters, 2 namespaces, 2 workgroups, 12 datashares, 1,096 snapshots, 24 reserved nodes)
260
+ - 95 IAM roles
261
+ - 3 MWAA environments
262
+ - ✅ Snapshot cost totaling $88,684.92/month identified
263
+ - ✅ Reserved node status correctly identified (active vs retired)
264
+ - ✅ Cross-account datashares flagged correctly
265
+ - ✅ WLM configuration analysis working
266
+ - ✅ CloudWatch-based activity tracking working
267
+ - ✅ HTML report with Cost Optimization and Governance sections
268
+
269
+ ### GCP
270
+ - ✅ Discovered GCS buckets, BigQuery datasets, Dataproc clusters, Pub/Sub topics
271
+ - ✅ IAM service accounts scanning
272
+ - ✅ Gemini API cost tracking from billing export
273
+ - ✅ BigQuery query cost tracking (including public datasets)
274
+
275
+ ## 🔒 IAM Permissions Required
276
+
277
+ ### AWS
278
+ The complete IAM policy is available in `aws-iam-policy.json`. Key permission groups:
279
+
280
+ | Permission Group | Actions | Purpose |
281
+ |-----------------|---------|---------|
282
+ | S3 | 9 actions | Bucket metadata, public access, encryption |
283
+ | Glue Data Catalog | 6 actions | Databases, tables, partitions |
284
+ | Glue Crawlers | 4 actions | Crawler status, metrics |
285
+ | Glue ETL Jobs | 5 actions | Job status, run history |
286
+ | Glue Connections | 2 actions | JDBC connections |
287
+ | Athena | 4 actions | Workgroups, query history |
288
+ | Redshift Clusters | 4 actions | Cluster metadata, logging |
289
+ | Redshift Snapshots | 3 actions | Snapshot inventory |
290
+ | Redshift Reserved Nodes | 3 actions | Reservation status |
291
+ | Redshift WLM | 2 actions | Parameter groups |
292
+ | Redshift Datashares | 3 actions | Cross-account sharing |
293
+ | Redshift Serverless | 5 actions | Namespaces, workgroups |
294
+ | IAM | 8 actions | Role policies, data access |
295
+ | MWAA | 3 actions | Airflow environments |
296
+ | CloudWatch | 3 actions | Metrics for activity tracking |
297
+ | CloudTrail | 1 action | Last activity detection |
298
+ | Cost Explorer | 5 actions | Actual cost reporting |
299
+ | STS | 1 action | Account identity |
300
+
301
+ **Total: 71 read-only actions** following the principle of least privilege.
302
+
303
+ ### GCP
304
+ Required IAM roles for the service account:
305
+ - `roles/storage.objectViewer` - Cloud Storage
306
+ - `roles/bigquery.dataViewer` + `roles/bigquery.jobUser` - BigQuery
307
+ - `roles/dataproc.viewer` - Dataproc
308
+ - `roles/pubsub.subscriber` - Pub/Sub
309
+ - `roles/iam.serviceAccountViewer` - IAM service accounts
310
+ - `roles/serviceusage.serviceUsageViewer` - API status
311
+ - `roles/billing.costsViewer` - Cloud Billing cost data (optional)
312
+ - `roles/monitoring.viewer` - Cloud Monitoring
313
+
314
+ ## 📋 TODO for Full v2
315
+
316
+ ### Additional AWS Collectors
317
+ - [ ] OpenSearch collector
318
+ - [ ] EMR collector
319
+ - [ ] SageMaker collector
320
+ - [ ] Bedrock collector
321
+ - [ ] MSK (Kafka) collector
322
+ - [ ] Kinesis collector
323
+ - [ ] DataSync/Transfer Family collector
324
+ - [ ] EBS Volumes & Snapshots collector
325
+ - [ ] VPC Endpoints collector
326
+ - [ ] Lake Formation collector
327
+ - [ ] Step Functions collector
328
+ - [ ] EventBridge collector
329
+
330
+ ### Redshift Deep Governance (Phase 2)
331
+ - [ ] Schema-level inventory via Redshift Data API
332
+ - [ ] Table-level inventory with column metadata
333
+ - [ ] PII detection via column naming heuristics
334
+ - [ ] Permission matrix visualization
335
+ - [ ] Usage-based stale table detection (STL_SCAN)
336
+
337
+ ### Additional GCP Collectors
338
+ - [ ] Cloud SQL collector
339
+ - [ ] Cloud Spanner collector
340
+ - [ ] Bigtable collector
341
+ - [ ] Firestore collector
342
+ - [ ] Vertex AI collector
343
+ - [ ] Dataflow collector
344
+ - [ ] Cloud Composer collector
345
+
346
+ ### Enhancements
347
+ - [ ] Parallel collection for faster scans
348
+ - [ ] Progress bars with ETA
349
+ - [ ] PDF report export
350
+ - [ ] Cost alerts and thresholds
351
+ - [ ] Asset dependency mapping
352
+ - [ ] Realized savings tracking (scan-over-scan comparison)
353
+
354
+ ## 🚀 Next Steps
355
+
356
+ 1. **Redshift Deep Governance** - Schema/table level inventory without data access
357
+ 2. **Azure Provider** - Blob Storage, Data Lake, Synapse, Databricks
358
+ 3. **Databricks Provider** - Workspace discovery, Unity Catalog
359
+ 4. **Enterprise Features** - RBAC, audit logging, compliance reporting
@@ -0,0 +1,39 @@
1
+ .PHONY: install lint format fix pre-commit test build clean help
2
+
3
+ help:
4
+ @echo "Available commands:"
5
+ @echo " make install - Install dependencies (including dev)"
6
+ @echo " make lint - Run linting checks"
7
+ @echo " make format - Check code formatting"
8
+ @echo " make fix - Auto-fix linting and formatting issues"
9
+ @echo " make pre-commit - Run all pre-commit hooks"
10
+ @echo " make test - Run tests with coverage"
11
+ @echo " make build - Build package"
12
+ @echo " make clean - Clean build artifacts"
13
+
14
+ install:
15
+ uv sync --dev
16
+ uv run pre-commit install
17
+
18
+ lint:
19
+ uv run ruff check .
20
+
21
+ format:
22
+ uv run ruff format --check .
23
+
24
+ fix:
25
+ uv run ruff check --fix .
26
+ uv run ruff format .
27
+
28
+ pre-commit:
29
+ uv run pre-commit run --all-files
30
+
31
+ test:
32
+ uv run pytest --cov=nuvu_scan --cov-report=term-missing
33
+
34
+ build:
35
+ uv build
36
+
37
+ clean:
38
+ rm -rf dist/ build/ *.egg-info/ .coverage coverage.xml .pytest_cache/ .ruff_cache/
39
+ find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true