nuvu-scan 1.3.7.tar.gz → 2.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.github/workflows/ci.yml +32 -17
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.github/workflows/release.yml +19 -19
- nuvu_scan-2.0.0/.pre-commit-config.yaml +42 -0
- nuvu_scan-2.0.0/DEVELOPMENT_STATUS.md +359 -0
- nuvu_scan-2.0.0/Makefile +39 -0
- nuvu_scan-1.3.7/README.md → nuvu_scan-2.0.0/PKG-INFO +115 -11
- nuvu_scan-1.3.7/PKG-INFO → nuvu_scan-2.0.0/README.md +68 -49
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/RELEASE.md +2 -2
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/__init__.py +1 -1
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/commands/scan.py +94 -0
- nuvu_scan-2.0.0/nuvu_scan/cli/formatters/html.py +421 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/main.py +2 -1
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/base.py +6 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/aws_scanner.py +55 -15
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/athena.py +3 -0
- nuvu_scan-2.0.0/nuvu_scan/core/providers/aws/collectors/glue.py +553 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/iam.py +9 -0
- nuvu_scan-2.0.0/nuvu_scan/core/providers/aws/collectors/redshift.py +910 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/gcp_scanner.py +42 -10
- nuvu_scan-2.0.0/pyproject.toml +138 -0
- nuvu_scan-1.3.7/DEVELOPMENT_STATUS.md +0 -249
- nuvu_scan-1.3.7/nuvu_scan/cli/formatters/html.py +0 -169
- nuvu_scan-1.3.7/nuvu_scan/core/providers/aws/collectors/glue.py +0 -148
- nuvu_scan-1.3.7/nuvu_scan/core/providers/aws/collectors/redshift.py +0 -232
- nuvu_scan-1.3.7/pyproject.toml +0 -90
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.gitignore +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/CONTRIBUTING.md +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/commands/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/formatters/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/formatters/csv.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/cli/formatters/json.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/analyzers/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/models/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/cost_explorer.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/mwaa.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/aws/collectors/s3.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/bigquery.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/billing.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/dataproc.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/gcs.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/gemini.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/iam.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/nuvu_scan/core/providers/gcp/collectors/pubsub.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/tests/__init__.py +0 -0
- {nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/tests/test_base.py +0 -0

{nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.github/workflows/ci.yml
@@ -7,43 +7,58 @@ on:
     branches: [main, develop]
 
 jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
+
+      - name: Set up Python
+        run: uv python install 3.12
+
+      - name: Install dependencies
+        run: uv sync --dev
+
+      - name: Run pre-commit
+        run: uv run pre-commit run --all-files
+
   test:
     runs-on: ubuntu-latest
+    needs: lint
     strategy:
       matrix:
         python-version: ["3.10", "3.11", "3.12", "3.13"]
-
+
     steps:
       - uses: actions/checkout@v4
-
+
       - name: Install uv
         uses: astral-sh/setup-uv@v4
         with:
           version: "latest"
-
+
       - name: Set up Python ${{ matrix.python-version }}
         run: uv python install ${{ matrix.python-version }}
-
+
       - name: Install dependencies
         run: |
           uv sync --dev
-
-      - name: Run linter
-        run: |
-          uv run ruff check .
-          uv run black --check .
-
+
       - name: Run type checker
         run: |
           uv run mypy nuvu_scan || true  # Allow failures for now
-
+
       - name: Run tests
         run: |
           uv run pytest --cov=nuvu_scan --cov-report=xml
         env:
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-
+
       - name: Upload coverage
         uses: codecov/codecov-action@v3
         with:
@@ -54,22 +69,22 @@ jobs:
     runs-on: ubuntu-latest
     needs: test
     if: github.event_name == 'push' && github.ref == 'refs/heads/main'
-
+
     steps:
       - uses: actions/checkout@v4
-
+
       - name: Install uv
         uses: astral-sh/setup-uv@v4
         with:
           version: "latest"
-
+
       - name: Set up Python
         run: uv python install 3.11
-
+
       - name: Build package
         run: |
          uv build
-
+
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
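Taken together, the new `lint` job and the trimmed `test` job above boil down to a handful of `uv` commands. The following is a local approximation only (a sketch assuming `uv` is installed and the repository is checked out; the Codecov upload step is omitted):

```bash
# Local approximation of the new lint and test jobs in ci.yml.
uv python install 3.12                            # any of 3.10-3.13, per the test matrix
uv sync --dev                                     # install runtime and dev dependencies
uv run pre-commit run --all-files                 # lint job: ruff, file hygiene, bandit hooks
uv run mypy nuvu_scan || true                     # type check; failures tolerated, as in CI
uv run pytest --cov=nuvu_scan --cov-report=xml    # test job with coverage report
```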

{nuvu_scan-1.3.7 → nuvu_scan-2.0.0}/.github/workflows/release.yml
@@ -18,21 +18,21 @@ jobs:
     permissions:
       contents: write  # Required to create tags and releases
       id-token: write  # Required for trusted publishing
-
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
         with:
           fetch-depth: 0  # Fetch full history for version comparison
-
+
       - name: Install uv
         uses: astral-sh/setup-uv@v4
         with:
           version: "latest"
-
+
       - name: Set up Python
         run: uv python install 3.12
-
+
       - name: Extract version from pyproject.toml
         id: get_version
         run: |
@@ -41,18 +41,18 @@ jobs:
           VERSION=$(grep -E '^\s*version\s*=' pyproject.toml | head -1 | sed -E "s/.*version\s*=\s*['\"]([^'\"]+)['\"].*/\1/" | tr -d ' ')
           echo "version=$VERSION" >> $GITHUB_OUTPUT
           echo "Current version: $VERSION"
-
+
       - name: Check if version was bumped
         id: check_version
         if: github.event_name == 'push'
         run: |
           CURRENT_VERSION="${{ steps.get_version.outputs.version }}"
-
+
           # Get the previous commit's version
           git checkout HEAD~1 pyproject.toml 2>/dev/null || echo "No previous commit found"
           PREVIOUS_VERSION=$(grep -E '^\s*version\s*=' pyproject.toml 2>/dev/null | head -1 | sed -E "s/.*version\s*=\s*['\"]([^'\"]+)['\"].*/\1/" | tr -d ' ' || echo "")
           git checkout HEAD pyproject.toml
-
+
           if [ -z "$PREVIOUS_VERSION" ]; then
             echo "No previous version found, assuming first release"
             echo "should_release=true" >> $GITHUB_OUTPUT
@@ -63,7 +63,7 @@ jobs:
             echo "Version unchanged ($CURRENT_VERSION), skipping release"
             echo "should_release=false" >> $GITHUB_OUTPUT
           fi
-
+
       - name: Check if tag already exists
         id: check_tag
         if: |
@@ -72,7 +72,7 @@ jobs:
         run: |
           VERSION="${{ steps.get_version.outputs.version }}"
           TAG="v${VERSION}"
-
+
           if git rev-parse "$TAG" >/dev/null 2>&1; then
             echo "Tag $TAG already exists, skipping release"
             echo "tag_exists=true" >> $GITHUB_OUTPUT
@@ -80,7 +80,7 @@ jobs:
             echo "Tag $TAG does not exist, will create release"
             echo "tag_exists=false" >> $GITHUB_OUTPUT
           fi
-
+
       - name: Create Git Tag
         if: |
           ((github.event_name == 'push' && steps.check_version.outputs.should_release == 'true' && steps.check_tag.outputs.tag_exists == 'false') ||
@@ -88,14 +88,14 @@ jobs:
         run: |
           VERSION="${{ steps.get_version.outputs.version }}"
           TAG="v${VERSION}"
-
+
           git config user.name "github-actions[bot]"
           git config user.email "github-actions[bot]@users.noreply.github.com"
-
+
           git tag -a "$TAG" -m "Release $TAG"
           git push origin "$TAG"
           echo "Created and pushed tag: $TAG"
-
+
       - name: Create GitHub Release
         if: |
           ((github.event_name == 'push' && steps.check_version.outputs.should_release == 'true' && steps.check_tag.outputs.tag_exists == 'false') ||
@@ -106,12 +106,12 @@ jobs:
           name: Release v${{ steps.get_version.outputs.version }}
           body: |
             ## Release v${{ steps.get_version.outputs.version }}
-
+
             Automated release created from merged PR.
-
+
             ### Changes
             See the merged PR for detailed changelog.
-
+
             ### Installation
             ```bash
             pip install --upgrade nuvu-scan==${{ steps.get_version.outputs.version }}
@@ -120,14 +120,14 @@ jobs:
           prerelease: false
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
+
       - name: Build package
         if: |
           ((github.event_name == 'push' && steps.check_version.outputs.should_release == 'true' && steps.check_tag.outputs.tag_exists == 'false') ||
           (github.event_name == 'workflow_dispatch' && steps.check_tag.outputs.tag_exists == 'false'))
         run: |
           uv build
-
+
       - name: Publish to PyPI
         if: |
           ((github.event_name == 'push' && steps.check_version.outputs.should_release == 'true' && steps.check_tag.outputs.tag_exists == 'false') ||
@@ -136,7 +136,7 @@ jobs:
         with:
           packages-dir: dist/
           print-hash: true
-
+
       - name: Summary
         if: always()
         run: |
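The release gate above hinges on one shell idiom: extract `version` from `pyproject.toml`, compare it against the previous commit, and only release when it changed. A standalone sketch of that check, assuming a git checkout with `pyproject.toml` at the repository root (the `should_release` value is echoed here instead of being written to `$GITHUB_OUTPUT`):

```bash
#!/usr/bin/env bash
# Sketch of the version-bump gate used in release.yml above.
set -euo pipefail

extract_version() {
  grep -E '^\s*version\s*=' pyproject.toml | head -1 \
    | sed -E "s/.*version\s*=\s*['\"]([^'\"]+)['\"].*/\1/" | tr -d ' '
}

CURRENT_VERSION=$(extract_version)

# Compare against the version committed in HEAD~1, as the workflow does.
git checkout HEAD~1 pyproject.toml 2>/dev/null || echo "No previous commit found"
PREVIOUS_VERSION=$(extract_version 2>/dev/null || echo "")
git checkout HEAD pyproject.toml

if [ -z "$PREVIOUS_VERSION" ] || [ "$CURRENT_VERSION" != "$PREVIOUS_VERSION" ]; then
  echo "should_release=true  (current: $CURRENT_VERSION, previous: ${PREVIOUS_VERSION:-none})"
else
  echo "should_release=false (version unchanged: $CURRENT_VERSION)"
fi
```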

nuvu_scan-2.0.0/.pre-commit-config.yaml
ADDED
@@ -0,0 +1,42 @@
+# Pre-commit hooks for nuvu-scan
+# Install: uv run pre-commit install
+# Run manually: uv run pre-commit run --all-files
+
+repos:
+  # Ruff - Fast Python linter and formatter
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.8.6
+    hooks:
+      # Run the linter with auto-fix
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+      # Run the formatter
+      - id: ruff-format
+
+  # Pre-commit hooks for general file hygiene
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-json
+      - id: check-added-large-files
+        args: ['--maxkb=1000']
+      - id: check-merge-conflict
+      - id: detect-private-key
+
+  # Check for common security issues
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.8.2
+    hooks:
+      - id: bandit
+        args: ["-c", "pyproject.toml"]
+        additional_dependencies: ["bandit[toml]"]
+
+# Configuration
+ci:
+  autofix_commit_msg: |
+    [pre-commit.ci] auto fixes from pre-commit hooks
+  autoupdate_commit_msg: |
+    [pre-commit.ci] pre-commit autoupdate

nuvu_scan-2.0.0/DEVELOPMENT_STATUS.md
ADDED
@@ -0,0 +1,359 @@
+# Nuvu Scan - Development Status
+
+**Multi-Cloud Data Asset Control** - Designed from the ground up to support AWS, GCP, Azure, and Databricks.
+
+## ✅ Completed (v2.0.0)
+
+### Core Architecture
+- ✅ Cloud-agnostic base interface (`CloudProviderScan`)
+- ✅ Normalized asset categories enum (now includes `DATA_PIPELINE`, `DATA_SHARING`)
+- ✅ Cloud-agnostic data models (`Asset`, `ScanResult`, `ScanConfig`)
+- ✅ Provider module structure for future multi-cloud support
+- ✅ Modern Python packaging with `uv` and `pyproject.toml`
+- ✅ Python 3.10+ support (removed EOL versions 3.8, 3.9)
+
+### AWS Provider Implementation
+
+#### S3 Bucket Collector
+- ✅ Lists all buckets across all regions
+- ✅ Gets bucket metadata (size, storage class, tags)
+- ✅ Detects public access and policy status
+- ✅ Estimates costs (storage + requests)
+- ✅ Flags risks (empty buckets, PII naming, public access)
+- ✅ Infers ownership from tags
+- ✅ Last activity tracking via CloudTrail
+
+#### Glue Data Catalog Collector (Enhanced in v2.0.0)
+- ✅ **Databases & Tables**
+  - Lists databases and tables
+  - Detects empty tables and databases
+  - Links databases to their crawlers for activity tracking
+  - Table update time tracking
+  - External table (Spectrum) detection
+- ✅ **Glue Crawlers** (NEW)
+  - Lists all crawlers with status (READY, RUNNING)
+  - Schedule expression and state (SCHEDULED, unscheduled)
+  - Last crawl time and status
+  - Tables created/updated/deleted counts
+  - Risk flags: `stale_crawler` (>90 days), `no_schedule`, `never_run`
+- ✅ **Glue ETL Jobs** (NEW)
+  - Lists all ETL jobs
+  - Job type, Glue version, allocated capacity
+  - Last run status and time
+  - Cost estimation based on DPU hours
+  - Risk flags: `stale_job`, `never_run`, `failed_job`
+- ✅ **Glue Connections** (NEW)
+  - Lists JDBC connections
+  - Connection type and masked JDBC URLs
+  - Risk flags: `external_connection` (non-AWS databases)
+
+#### Athena Workgroup Collector
+- ✅ Lists workgroups
+- ✅ Analyzes query history (last 90 days)
+- ✅ Detects idle workgroups
+- ✅ Flags high failure rates
+- ✅ Last activity tracking from query stats
+
+#### Redshift Collector (Major Enhancement in v2.0.0)
+- ✅ **Provisioned Clusters** (Enhanced)
+  - Lists all clusters with detailed metrics
+  - Node type, count, encryption status
+  - CloudWatch-based activity tracking (DatabaseConnections, CPUUtilization)
+  - Cluster age calculation
+  - VPC and public accessibility detection
+  - **Reservation coverage analysis** - checks if covered by reserved nodes
+  - **WLM configuration analysis** - queue count, auto WLM, unlimited queues
+  - Potential reservation savings calculation (40% estimate)
+  - Risk flags: `publicly_accessible`, `unencrypted`, `low_activity`, `potentially_unused`, `no_reservation_long_running`, `default_wlm_only`, `unlimited_wlm_queue`
+- ✅ **Redshift Serverless**
+  - Namespaces with encryption status
+  - Workgroups with base capacity and cost estimation
+  - Risk flags: `publicly_accessible`
+- ✅ **Redshift Datashares** (NEW)
+  - Lists all datashares (inbound and outbound)
+  - Consumer account identification
+  - Cross-account and cross-region detection
+  - Public consumer allowance check
+  - Risk flags: `cross_account_sharing`, `cross_region_sharing`, `allows_public_consumers`
+- ✅ **Redshift Snapshots** (NEW)
+  - Lists all snapshots (manual and automated)
+  - Snapshot size and storage cost estimation
+  - Snapshot age tracking
+  - Orphan snapshot detection (source cluster deleted)
+  - Risk flags: `old_snapshot` (>90 days), `very_old_snapshot` (>365 days), `large_snapshot` (>1TB), `orphan_snapshot`
+- ✅ **Redshift Reserved Nodes** (NEW)
+  - Lists all reserved nodes (active and retired)
+  - Node type, count, offering type
+  - Remaining duration calculation
+  - Expiration tracking
+  - Annual and monthly cost calculation
+  - Risk flags: `reservation_expired`, `reservation_expiring_soon`, `reservation_retired`
+
+#### IAM Roles Collector
+- ✅ Lists IAM roles with data-access permissions
+- ✅ Detects unused roles (90+ days)
+- ✅ Flags overly permissive policies
+- ✅ Infers ownership from tags and role names
+- ✅ Last activity tracking from `RoleLastUsed`
+
+#### MWAA (Managed Workflows for Apache Airflow) Collector
+- ✅ Lists MWAA environments across regions
+- ✅ Collects environment details (status, version, worker counts)
+- ✅ Estimates costs based on environment class
+- ✅ Infers ownership from tags
+- ✅ Last activity tracking from `LastUpdate`
+
+#### Cost Explorer Integration
+- ✅ Retrieves actual costs from AWS Cost Explorer API
+- ✅ Service-level cost breakdown
+- ✅ Monthly cost estimates based on last 30 days
+- ✅ Cost summary asset in scan results
+
+### GCP Provider Implementation
+
+#### GCS (Google Cloud Storage) Collector
+- ✅ Lists all buckets
+- ✅ Gets bucket metadata (size, storage class, labels)
+- ✅ Detects public access
+- ✅ Estimates costs
+- ✅ Flags risks (empty buckets, public access)
+- ✅ Infers ownership from labels
+- ✅ Last activity tracking from bucket update time
+
+#### BigQuery Collector
+- ✅ Lists datasets and tables
+- ✅ Analyzes query job history (last 90 days)
+- ✅ Tracks query costs (including public datasets)
+- ✅ Creates dedicated asset for query costs
+- ✅ Estimates costs with 1 TB free tier consideration
+- ✅ Detailed usage metrics (TB processed, monthly estimates)
+- ✅ Last activity tracking from query stats
+
+#### Dataproc Collector
+- ✅ Lists Dataproc clusters
+- ✅ Collects cluster details and job history
+- ✅ Estimates costs
+- ✅ Last activity tracking from job stats
+
+#### Pub/Sub Collector
+- ✅ Lists topics and subscriptions
+- ✅ Collects topic metadata
+- ✅ Estimates costs
+- ✅ Last activity tracking
+
+#### IAM Service Accounts Collector
+- ✅ Lists service accounts
+- ✅ Checks for data-access roles
+- ✅ Flags overly permissive roles
+- ✅ Infers ownership from display names and email patterns
+- ✅ Last activity tracking from update time
+
+#### Gemini API Collector
+- ✅ Checks if Gemini API is enabled
+- ✅ Retrieves actual costs from BigQuery billing export
+- ✅ Fallback to Cloud Monitoring API for usage detection
+- ✅ Last activity tracking from billing data
+
+### CLI
+- ✅ Command-line interface with `nuvu scan --provider <aws|gcp>`
+- ✅ Support for multiple output formats:
+  - HTML (default) - Beautiful interactive report with governance insights
+  - JSON - Machine-readable format
+  - CSV - Spreadsheet-friendly format
+- ✅ Credential handling:
+  - AWS: env vars, CLI args, AWS profiles, IAM role assumption
+  - GCP: JSON key files, `GOOGLE_APPLICATION_CREDENTIALS`, JSON content
+- ✅ Region filtering support (AWS)
+- ✅ Project ID support (GCP)
+- ✅ **Nuvu Cloud API push** (`--push --api-key`)
+- ✅ **Collector Filtering** (NEW)
+  - `--collectors` / `-c` option to run specific collectors
+  - `--list-collectors` to show available collectors
+  - AWS collectors: `s3`, `glue`, `athena`, `redshift`, `iam`, `mwaa`
+  - GCP collectors: `gcs`, `bigquery`, `dataproc`, `pubsub`, `iam`, `gemini`
+  - Omit option for full scan (all collectors)
+- ✅ **Progress Logging** - Real-time status updates during collection
+
+### Enhanced HTML Reports (v2.0.0)
+- ✅ **Executive Summary** with key metrics
+- ✅ **Cost Optimization Section**
+  - Snapshot cost analysis with old snapshot flagging
+  - Reserved node status and expiration tracking
+  - Potential savings calculation
+- ✅ **Governance Insights Section**
+  - Stale/unused crawlers and ETL jobs
+  - Cross-account data sharing alerts
+  - WLM configuration review
+- ✅ Improved styling with insight boxes (warning, alert, info)
+- ✅ Potential savings card in summary
+
+### New Asset Categories (v2.0.0)
+- ✅ `DATA_PIPELINE` - ETL jobs, crawlers, workflows
+- ✅ `DATA_SHARING` - Datashares, cross-account sharing
+
+### New Asset Types (v2.0.0)
+| Asset Type | Service | Description |
+|------------|---------|-------------|
+| `glue_crawler` | Glue | Crawler status, schedule, last run |
+| `glue_job` | Glue | ETL job status, DPU allocation |
+| `glue_connection` | Glue | JDBC connections to external DBs |
+| `redshift_datashare` | Redshift | Cross-account data sharing |
+| `redshift_snapshot` | Redshift | Manual and automated snapshots |
+| `redshift_reserved_node` | Redshift | Reserved capacity purchases |
+| `redshift_serverless_workgroup` | Redshift | Serverless workgroup details |
+
+### New Risk Flags (v2.0.0)
+| Category | Flag | Description |
+|----------|------|-------------|
+| Glue | `stale_crawler` | Crawler hasn't run in 90+ days |
+| Glue | `no_schedule` | Crawler has no schedule configured |
+| Glue | `never_run` | Crawler or job has never been executed |
+| Glue | `stale_job` | ETL job hasn't run in 90+ days |
+| Glue | `failed_job` | Last job run failed |
+| Glue | `external_connection` | JDBC connection to non-AWS database |
+| Redshift | `cross_account_sharing` | Datashare shared to another AWS account |
+| Redshift | `cross_region_sharing` | Datashare shared across regions |
+| Redshift | `allows_public_consumers` | Datashare allows public consumers |
+| Redshift | `old_snapshot` | Snapshot older than 90 days |
+| Redshift | `very_old_snapshot` | Snapshot older than 365 days |
+| Redshift | `large_snapshot` | Snapshot larger than 1TB |
+| Redshift | `orphan_snapshot` | Source cluster no longer exists |
+| Redshift | `no_reservation_long_running` | Cluster running 90+ days without reservation |
+| Redshift | `reservation_expired` | Reserved node has expired |
+| Redshift | `reservation_expiring_soon` | Reserved node expires within 30 days |
+| Redshift | `default_wlm_only` | Cluster using only default WLM queue |
+| Redshift | `unlimited_wlm_queue` | WLM queue with no concurrency limit |
+
+### Cost Tracking & Reporting
+- ✅ Asset-level cost estimation for all resources
+- ✅ AWS Cost Explorer API integration for actual costs
+- ✅ GCP Cloud Billing API integration (Gemini costs)
+- ✅ BigQuery query cost tracking (including public datasets)
+- ✅ Redshift snapshot storage cost estimation
+- ✅ Potential reservation savings calculation
+- ✅ Cost summary assets showing service-level breakdowns
+
+### Usage & Activity Tracking
+- ✅ Last activity timestamp for all assets (`last_activity_at`)
+- ✅ Days since last use calculation
+- ✅ **CloudWatch metrics for Redshift** (DatabaseConnections, CPUUtilization)
+- ✅ CloudTrail integration for AWS (S3, Redshift)
+- ✅ Crawler run times for Glue database/table activity
+- ✅ Query history analysis (Athena, BigQuery)
+- ✅ Job history analysis (Dataproc, Glue ETL)
+
+### Package & Distribution
+- ✅ Modern Python packaging with `pyproject.toml` and `uv`
+- ✅ Comprehensive README.md with setup instructions
+- ✅ IAM policy file (`aws-iam-policy.json`) with 60+ read-only actions
+- ✅ GitHub Actions CI/CD workflows
+- ✅ Package structure ready for PyPI
+
+## 🧪 Tested
+
+### AWS (v2.0.0 Test Results)
+- ✅ Discovered 2,344 assets in single-region scan (us-west-2)
+  - 90 S3 buckets
+  - 1,013 Glue assets (94 databases, 904 tables, 10 crawlers, 2 jobs, 3 connections)
+  - 1 Athena workgroup
+  - 1,141 Redshift assets (5 clusters, 2 namespaces, 2 workgroups, 12 datashares, 1,096 snapshots, 24 reserved nodes)
+  - 95 IAM roles
+  - 3 MWAA environments
+- ✅ Snapshot cost totaling $88,684.92/month identified
+- ✅ Reserved node status correctly identified (active vs retired)
+- ✅ Cross-account datashares flagged correctly
+- ✅ WLM configuration analysis working
+- ✅ CloudWatch-based activity tracking working
+- ✅ HTML report with Cost Optimization and Governance sections
+
+### GCP
+- ✅ Discovered GCS buckets, BigQuery datasets, Dataproc clusters, Pub/Sub topics
+- ✅ IAM service accounts scanning
+- ✅ Gemini API cost tracking from billing export
+- ✅ BigQuery query cost tracking (including public datasets)
+
+## 🔒 IAM Permissions Required
+
+### AWS
+The complete IAM policy is available in `aws-iam-policy.json`. Key permission groups:
+
+| Permission Group | Actions | Purpose |
+|-----------------|---------|---------|
+| S3 | 9 actions | Bucket metadata, public access, encryption |
+| Glue Data Catalog | 6 actions | Databases, tables, partitions |
+| Glue Crawlers | 4 actions | Crawler status, metrics |
+| Glue ETL Jobs | 5 actions | Job status, run history |
+| Glue Connections | 2 actions | JDBC connections |
+| Athena | 4 actions | Workgroups, query history |
+| Redshift Clusters | 4 actions | Cluster metadata, logging |
+| Redshift Snapshots | 3 actions | Snapshot inventory |
+| Redshift Reserved Nodes | 3 actions | Reservation status |
+| Redshift WLM | 2 actions | Parameter groups |
+| Redshift Datashares | 3 actions | Cross-account sharing |
+| Redshift Serverless | 5 actions | Namespaces, workgroups |
+| IAM | 8 actions | Role policies, data access |
+| MWAA | 3 actions | Airflow environments |
+| CloudWatch | 3 actions | Metrics for activity tracking |
+| CloudTrail | 1 action | Last activity detection |
+| Cost Explorer | 5 actions | Actual cost reporting |
+| STS | 1 action | Account identity |
+
+**Total: 66 read-only actions** following the principle of least privilege.
+
+### GCP
+Required IAM roles for the service account:
+- `roles/storage.objectViewer` - Cloud Storage
+- `roles/bigquery.dataViewer` + `roles/bigquery.jobUser` - BigQuery
+- `roles/dataproc.viewer` - Dataproc
+- `roles/pubsub.subscriber` - Pub/Sub
+- `roles/iam.serviceAccountViewer` - IAM service accounts
+- `roles/serviceusage.serviceUsageViewer` - API status
+- `roles/billing.costsViewer` - Cost Explorer (optional)
+- `roles/monitoring.viewer` - Cloud Monitoring
+
+## 📋 TODO for Full v2
+
+### Additional AWS Collectors
+- [ ] OpenSearch collector
+- [ ] EMR collector
+- [ ] SageMaker collector
+- [ ] Bedrock collector
+- [ ] MSK (Kafka) collector
+- [ ] Kinesis collector
+- [ ] DataSync/Transfer Family collector
+- [ ] EBS Volumes & Snapshots collector
+- [ ] VPC Endpoints collector
+- [ ] Lake Formation collector
+- [ ] Step Functions collector
+- [ ] EventBridge collector
+
+### Redshift Deep Governance (Phase 2)
+- [ ] Schema-level inventory via Redshift Data API
+- [ ] Table-level inventory with column metadata
+- [ ] PII detection via column naming heuristics
+- [ ] Permission matrix visualization
+- [ ] Usage-based stale table detection (STL_SCAN)
+
+### Additional GCP Collectors
+- [ ] Cloud SQL collector
+- [ ] Cloud Spanner collector
+- [ ] Bigtable collector
+- [ ] Firestore collector
+- [ ] Vertex AI collector
+- [ ] Dataflow collector
+- [ ] Cloud Composer collector
+
+### Enhancements
+- [ ] Parallel collection for faster scans
+- [ ] Progress bars with ETA
+- [ ] PDF report export
+- [ ] Cost alerts and thresholds
+- [ ] Asset dependency mapping
+- [ ] Realized savings tracking (scan-over-scan comparison)
+
+## 🚀 Next Steps
+
+1. **Redshift Deep Governance** - Schema/table level inventory without data access
+2. **Azure Provider** - Blob Storage, Data Lake, Synapse, Databricks
+3. **Databricks Provider** - Workspace discovery, Unity Catalog
+4. **Enterprise Features** - RBAC, audit logging, compliance reporting
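The CLI features listed in the new DEVELOPMENT_STATUS.md map onto invocations like the following. This is an illustrative sketch only: the flags shown are the ones named above, while the comma-separated `--collectors` syntax and the `NUVU_API_KEY` variable are assumptions, not taken from the package.

```bash
pip install --upgrade nuvu-scan==2.0.0

# Full AWS scan with all collectors; the default output is an HTML report
nuvu scan --provider aws

# List available collectors, then limit a scan to Redshift and Glue
nuvu scan --list-collectors
nuvu scan --provider aws --collectors redshift,glue   # comma-separated list assumed

# GCP scan that also pushes results to the Nuvu Cloud API
nuvu scan --provider gcp --push --api-key "$NUVU_API_KEY"   # NUVU_API_KEY is hypothetical
```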
nuvu_scan-2.0.0/Makefile
ADDED
@@ -0,0 +1,39 @@
+.PHONY: install lint format fix test build clean help
+
+help:
+	@echo "Available commands:"
+	@echo "  make install     - Install dependencies (including dev)"
+	@echo "  make lint        - Run linting checks"
+	@echo "  make format      - Check code formatting"
+	@echo "  make fix         - Auto-fix linting and formatting issues"
+	@echo "  make pre-commit  - Run all pre-commit hooks"
+	@echo "  make test        - Run tests with coverage"
+	@echo "  make build       - Build package"
+	@echo "  make clean       - Clean build artifacts"
+
+install:
+	uv sync --dev
+	uv run pre-commit install
+
+lint:
+	uv run ruff check .
+
+format:
+	uv run ruff format --check .
+
+fix:
+	uv run ruff check --fix .
+	uv run ruff format .
+
+pre-commit:
+	uv run pre-commit run --all-files
+
+test:
+	uv run pytest --cov=nuvu_scan --cov-report=term-missing
+
+build:
+	uv build
+
+clean:
+	rm -rf dist/ build/ *.egg-info/ .coverage coverage.xml .pytest_cache/ .ruff_cache/
+	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true