splunk-ddss-extractor 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,277 @@
1
+ Metadata-Version: 2.4
2
+ Name: splunk-ddss-extractor
3
+ Version: 0.3.0
4
+ Classifier: Development Status :: 4 - Beta
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Programming Language :: Rust
13
+ Classifier: Topic :: System :: Archiving
14
+ Requires-Dist: zstandard>=0.22.0
15
+ Requires-Dist: orjson>=3.9.0
16
+ Requires-Dist: click>=8.1.7 ; extra == 'cli'
17
+ Requires-Dist: pytest>=7.4.3 ; extra == 'dev'
18
+ Requires-Dist: pytest-cov>=4.1.0 ; extra == 'dev'
19
+ Requires-Dist: pyarrow>=14.0.0 ; extra == 'parquet'
20
+ Requires-Dist: boto3>=1.34.0 ; extra == 's3'
21
+ Provides-Extra: cli
22
+ Provides-Extra: dev
23
+ Provides-Extra: parquet
24
+ Provides-Extra: s3
25
+ Summary: Extract events from Splunk journal archives to raw format (JSON, CSV, Parquet)
26
+ Author: Lech Lachowicz
27
+ License: MIT
28
+ Requires-Python: >=3.10
29
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
30
+ Project-URL: Homepage, https://github.com/ponquersohn/splunk_ddss_extractor
31
+ Project-URL: Issues, https://github.com/ponquersohn/splunk_ddss_extractor/issues
32
+ Project-URL: Repository, https://github.com/ponquersohn/splunk_ddss_extractor
33
+
34
+ # Splunk DDSS Extractor
35
+
36
+ Convert Splunk self-hosted storage archives from compressed journal format to raw format.
37
+
38
+ ## Overview
39
+
40
+ Splunk DDSS Extractor is a Python library that processes Splunk journal archives, extracts events, and converts them to raw format for easier analysis and long-term storage. Use it in your own applications, data pipelines, or as a CLI tool.
41
+
42
+ **Note:** This project is based on the concept from [fionera/splunker](https://github.com/fionera/splunker), reimplemented in Python with additional features for production use.
43
+
44
+ ## Features
45
+
46
+ - Automatic compression detection (.zst, .gz, uncompressed)
47
+ - Extract events with full metadata (host, source, sourcetype, timestamps)
48
+ - Multiple output formats (JSON Lines, CSV, Parquet)
49
+ - Streaming processing for memory efficiency
50
+ - Simple Python API and CLI interface
51
+ - Docker support for containerized deployments
52
+ - Integrates with AWS Lambda, ECS, or any Python environment
53
+
54
+ ## Quick Start
55
+
56
+ ### Using the Makefile (Recommended)
57
+
58
+ ```bash
59
+ # Show all available commands
60
+ make env
61
+
62
+ # Complete development setup (venv + dependencies)
63
+ make dev-setup
64
+
65
+ # Run tests
66
+ make test
67
+
68
+ # Build Docker image
69
+ make docker
70
+ ```
71
+
72
+ ### Manual Setup
73
+
74
+ #### Installation
75
+
76
+ ```bash
77
+ # Create virtual environment
78
+ python3 -m venv venv
79
+ source venv/bin/activate
80
+
81
+ # Install dependencies
82
+ pip install -r requirements.txt
83
+ pip install -e .
84
+
85
+ # Optional: Install Parquet support
86
+ pip install pyarrow
87
+ ```
88
+
89
+ #### Basic Usage
90
+
91
+ **Extract a journal file:**
92
+
93
+ ```python
94
+ from splunk_ddss_extractor.extractor import Extractor
95
+
96
+ extractor = Extractor()
97
+
98
+ # Extract to JSON Lines
99
+ extractor.extract(
100
+ input_path='/path/to/journal.zst',
101
+ output_path='output.json',
102
+ output_format='ndjson'
103
+ )
104
+
105
+ # Extract to CSV
106
+ extractor.extract(
107
+ input_path='/path/to/journal.zst',
108
+ output_path='output.csv',
109
+ output_format='csv'
110
+ )
111
+
112
+ # Extract to Parquet
113
+ extractor.extract(
114
+ input_path='/path/to/journal.zst',
115
+ output_path='output.parquet',
116
+ output_format='parquet'
117
+ )
118
+
119
+ # Extract from S3 to local file (streaming, no download)
120
+ extractor.extract(
121
+ input_path='s3://bucket/path/journal.zst',
122
+ output_path='output.json',
123
+ output_format='ndjson'
124
+ )
125
+
126
+ # Extract from local to S3
127
+ extractor.extract(
128
+ input_path='/path/to/journal.zst',
129
+ output_path='s3://bucket/output/data.json',
130
+ output_format='ndjson'
131
+ )
132
+ ```
133
+
134
+ **Low-level streaming (advanced):**
135
+
136
+ ```python
137
+ from splunk_ddss_extractor.decoder import JournalDecoder
138
+ import zstandard as zstd
139
+
140
+ # For low-level access, decoder needs an uncompressed stream
141
+ # If reading a compressed file, decompress it first:
142
+ with open('/path/to/journal.zst', 'rb') as compressed_file:
143
+ dctx = zstd.ZstdDecompressor()
144
+ with dctx.stream_reader(compressed_file) as reader:
145
+ decoder = JournalDecoder(reader=reader)
146
+ while decoder.scan():
147
+ event = decoder.get_event()
148
+ print(f"Host: {decoder.host()}")
149
+ print(f"Source: {decoder.source()}")
150
+ print(f"Sourcetype: {decoder.source_type()}")
151
+ print(f"Timestamp: {event.index_time}")
152
+ print(f"Message: {event.message_string()}")
153
+
154
+ # For uncompressed journal files:
155
+ with open('/path/to/journal', 'rb') as f:
156
+ decoder = JournalDecoder(reader=f)
157
+ while decoder.scan():
158
+ event = decoder.get_event()
159
+ # Process event...
160
+ ```
161
+
162
+ #### Docker Usage
163
+
164
+ ```bash
165
+ # Build image
166
+ make docker
167
+
168
+ # Run with local file
169
+ docker run -v /path/to/data:/data ghcr.io/ponquersohn/splunk_ddss_extractor:latest
170
+
171
+ # Use in your own Dockerfile
172
+ FROM ghcr.io/ponquersohn/splunk_ddss_extractor:latest
173
+ COPY your_script.py /app/
174
+ CMD ["python", "/app/your_script.py"]
175
+ ```
176
+
177
+ ## Architecture
178
+
179
+ This is a **Python library** with the following components:
180
+
181
+ 1. **Journal Decoder** - Low-level decoder for Splunk's binary journal format
182
+ 2. **Extractor Interface** - High-level API for common extraction tasks
183
+ 3. **Output Writers** - Support for JSON, CSV, and Parquet formats
184
+ 4. **Compression Detection** - Automatic detection and handling of .zst, .gz formats
185
+
186
+ **Integration Options:**
187
+ - Direct Python import in your applications
188
+ - AWS Lambda functions for serverless processing
189
+ - ECS/Fargate tasks for batch processing
190
+ - Docker containers for isolated environments
191
+ - Local scripts for one-off extractions
192
+
193
+ See [CLAUDE.md](CLAUDE.md) for detailed architecture documentation.
194
+
195
+ ## Development
196
+
197
+ ### Quick Commands
198
+
199
+ ```bash
200
+ # Run tests
201
+ make test
202
+
203
+ # Run tests with coverage
204
+ make test-coverage
205
+
206
+ # Build Docker image
207
+ make docker
208
+
209
+ # Test Docker locally
210
+ make docker-run
211
+
212
+ # Run all checks (tests)
213
+ make check
214
+
215
+ # Clean temporary files
216
+ make clean
217
+ ```
218
+
219
+ ### Manual Commands
220
+
221
+ ```bash
222
+ # Run tests
223
+ pytest tests/
224
+
225
+ # Code formatting
226
+ black src/ tests/
227
+
228
+ # Local Docker testing
229
+ cd docker
230
+ docker-compose up
231
+ ```
232
+
233
+ ## Configuration
234
+
235
+ When integrating with AWS or other environments, you may use these environment variables:
236
+
237
+ - `OUTPUT_FORMAT`: Output format - json, csv, or parquet (default: json)
238
+ - `LOG_LEVEL`: Logging level (default: INFO)
239
+ - `AWS_REGION`: AWS region for S3 operations (default: us-east-1)
240
+ - `S3_BUCKET`: S3 bucket name (for S3 integrations)
241
+
242
+ ## Output Formats
243
+
244
+ ### JSON Lines (default)
245
+
246
+ ```json
247
+ {"timestamp": 1234567890, "host": "server01", "source": "/var/log/app.log", "sourcetype": "app", "message": "Event data"}
248
+ ```
249
+
250
+ ### CSV
251
+
252
+ ```csv
253
+ timestamp,host,source,sourcetype,message
254
+ 1234567890,server01,/var/log/app.log,app,"Event data"
255
+ ```
256
+
257
+ ### Parquet
258
+
259
+ Columnar format optimized for analytics (requires pyarrow).
260
+
261
+ ## Credits
262
+
263
+ This project is inspired by and based on the concept from [fionera/splunker](https://github.com/fionera/splunker), a Go implementation for extracting Splunk journal files. This Python implementation extends the original concept with:
264
+
265
+ - Streaming S3 support (no temporary files)
266
+ - Multiple output formats (JSON Lines, CSV, Parquet)
267
+ - Python library API for easy integration
268
+ - Docker and AWS deployment options
269
+
270
+ ## License
271
+
272
+ MIT
273
+
274
+ ## Contributing
275
+
276
+ See [CLAUDE.md](CLAUDE.md) for development guidelines.
277
+
@@ -0,0 +1,243 @@
1
+ # Splunk DDSS Extractor
2
+
3
+ Convert Splunk self-hosted storage archives from compressed journal format to raw format.
4
+
5
+ ## Overview
6
+
7
+ Splunk DDSS Extractor is a Python library that processes Splunk journal archives, extracts events, and converts them to raw format for easier analysis and long-term storage. Use it in your own applications, data pipelines, or as a CLI tool.
8
+
9
+ **Note:** This project is based on the concept from [fionera/splunker](https://github.com/fionera/splunker), reimplemented in Python with additional features for production use.
10
+
11
+ ## Features
12
+
13
+ - Automatic compression detection (.zst, .gz, uncompressed)
14
+ - Extract events with full metadata (host, source, sourcetype, timestamps)
15
+ - Multiple output formats (JSON Lines, CSV, Parquet)
16
+ - Streaming processing for memory efficiency
17
+ - Simple Python API and CLI interface
18
+ - Docker support for containerized deployments
19
+ - Integrates with AWS Lambda, ECS, or any Python environment
20
+
21
+ ## Quick Start
22
+
23
+ ### Using the Makefile (Recommended)
24
+
25
+ ```bash
26
+ # Show all available commands
27
+ make env
28
+
29
+ # Complete development setup (venv + dependencies)
30
+ make dev-setup
31
+
32
+ # Run tests
33
+ make test
34
+
35
+ # Build Docker image
36
+ make docker
37
+ ```
38
+
39
+ ### Manual Setup
40
+
41
+ #### Installation
42
+
43
+ ```bash
44
+ # Create virtual environment
45
+ python3 -m venv venv
46
+ source venv/bin/activate
47
+
48
+ # Install dependencies
49
+ pip install -r requirements.txt
50
+ pip install -e .
51
+
52
+ # Optional: Install Parquet support
53
+ pip install pyarrow
54
+ ```
55
+
56
+ #### Basic Usage
57
+
58
+ **Extract a journal file:**
59
+
60
+ ```python
61
+ from splunk_ddss_extractor.extractor import Extractor
62
+
63
+ extractor = Extractor()
64
+
65
+ # Extract to JSON Lines
66
+ extractor.extract(
67
+ input_path='/path/to/journal.zst',
68
+ output_path='output.json',
69
+ output_format='ndjson'
70
+ )
71
+
72
+ # Extract to CSV
73
+ extractor.extract(
74
+ input_path='/path/to/journal.zst',
75
+ output_path='output.csv',
76
+ output_format='csv'
77
+ )
78
+
79
+ # Extract to Parquet
80
+ extractor.extract(
81
+ input_path='/path/to/journal.zst',
82
+ output_path='output.parquet',
83
+ output_format='parquet'
84
+ )
85
+
86
+ # Extract from S3 to local file (streaming, no download)
87
+ extractor.extract(
88
+ input_path='s3://bucket/path/journal.zst',
89
+ output_path='output.json',
90
+ output_format='ndjson'
91
+ )
92
+
93
+ # Extract from local to S3
94
+ extractor.extract(
95
+ input_path='/path/to/journal.zst',
96
+ output_path='s3://bucket/output/data.json',
97
+ output_format='ndjson'
98
+ )
99
+ ```
100
+
101
+ **Low-level streaming (advanced):**
102
+
103
+ ```python
104
+ from splunk_ddss_extractor.decoder import JournalDecoder
105
+ import zstandard as zstd
106
+
107
+ # For low-level access, decoder needs an uncompressed stream
108
+ # If reading a compressed file, decompress it first:
109
+ with open('/path/to/journal.zst', 'rb') as compressed_file:
110
+ dctx = zstd.ZstdDecompressor()
111
+ with dctx.stream_reader(compressed_file) as reader:
112
+ decoder = JournalDecoder(reader=reader)
113
+ while decoder.scan():
114
+ event = decoder.get_event()
115
+ print(f"Host: {decoder.host()}")
116
+ print(f"Source: {decoder.source()}")
117
+ print(f"Sourcetype: {decoder.source_type()}")
118
+ print(f"Timestamp: {event.index_time}")
119
+ print(f"Message: {event.message_string()}")
120
+
121
+ # For uncompressed journal files:
122
+ with open('/path/to/journal', 'rb') as f:
123
+ decoder = JournalDecoder(reader=f)
124
+ while decoder.scan():
125
+ event = decoder.get_event()
126
+ # Process event...
127
+ ```
128
+
129
+ #### Docker Usage
130
+
131
+ ```bash
132
+ # Build image
133
+ make docker
134
+
135
+ # Run with local file
136
+ docker run -v /path/to/data:/data ghcr.io/ponquersohn/splunk_ddss_extractor:latest
137
+
138
+ # Use in your own Dockerfile
139
+ FROM ghcr.io/ponquersohn/splunk_ddss_extractor:latest
140
+ COPY your_script.py /app/
141
+ CMD ["python", "/app/your_script.py"]
142
+ ```
143
+
144
+ ## Architecture
145
+
146
+ This is a **Python library** with the following components:
147
+
148
+ 1. **Journal Decoder** - Low-level decoder for Splunk's binary journal format
149
+ 2. **Extractor Interface** - High-level API for common extraction tasks
150
+ 3. **Output Writers** - Support for JSON, CSV, and Parquet formats
151
+ 4. **Compression Detection** - Automatic detection and handling of .zst, .gz formats
152
+
153
+ **Integration Options:**
154
+ - Direct Python import in your applications
155
+ - AWS Lambda functions for serverless processing
156
+ - ECS/Fargate tasks for batch processing
157
+ - Docker containers for isolated environments
158
+ - Local scripts for one-off extractions
159
+
160
+ See [CLAUDE.md](CLAUDE.md) for detailed architecture documentation.
161
+
162
+ ## Development
163
+
164
+ ### Quick Commands
165
+
166
+ ```bash
167
+ # Run tests
168
+ make test
169
+
170
+ # Run tests with coverage
171
+ make test-coverage
172
+
173
+ # Build Docker image
174
+ make docker
175
+
176
+ # Test Docker locally
177
+ make docker-run
178
+
179
+ # Run all checks (tests)
180
+ make check
181
+
182
+ # Clean temporary files
183
+ make clean
184
+ ```
185
+
186
+ ### Manual Commands
187
+
188
+ ```bash
189
+ # Run tests
190
+ pytest tests/
191
+
192
+ # Code formatting
193
+ black src/ tests/
194
+
195
+ # Local Docker testing
196
+ cd docker
197
+ docker-compose up
198
+ ```
199
+
200
+ ## Configuration
201
+
202
+ When integrating with AWS or other environments, you may use these environment variables:
203
+
204
+ - `OUTPUT_FORMAT`: Output format - json, csv, or parquet (default: json)
205
+ - `LOG_LEVEL`: Logging level (default: INFO)
206
+ - `AWS_REGION`: AWS region for S3 operations (default: us-east-1)
207
+ - `S3_BUCKET`: S3 bucket name (for S3 integrations)
208
+
209
+ ## Output Formats
210
+
211
+ ### JSON Lines (default)
212
+
213
+ ```json
214
+ {"timestamp": 1234567890, "host": "server01", "source": "/var/log/app.log", "sourcetype": "app", "message": "Event data"}
215
+ ```
216
+
217
+ ### CSV
218
+
219
+ ```csv
220
+ timestamp,host,source,sourcetype,message
221
+ 1234567890,server01,/var/log/app.log,app,"Event data"
222
+ ```
223
+
224
+ ### Parquet
225
+
226
+ Columnar format optimized for analytics (requires pyarrow).
227
+
228
+ ## Credits
229
+
230
+ This project is inspired by and based on the concept from [fionera/splunker](https://github.com/fionera/splunker), a Go implementation for extracting Splunk journal files. This Python implementation extends the original concept with:
231
+
232
+ - Streaming S3 support (no temporary files)
233
+ - Multiple output formats (JSON Lines, CSV, Parquet)
234
+ - Python library API for easy integration
235
+ - Docker and AWS deployment options
236
+
237
+ ## License
238
+
239
+ MIT
240
+
241
+ ## Contributing
242
+
243
+ See [CLAUDE.md](CLAUDE.md) for development guidelines.
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["maturin>=1.0,<2.0"]
3
+ build-backend = "maturin"
4
+
5
+ [project]
6
+ name = "splunk-ddss-extractor"
7
+ version = "0.3.0"
8
+ description = "Extract events from Splunk journal archives to raw format (JSON, CSV, Parquet)"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "Lech Lachowicz"},
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Programming Language :: Rust",
25
+ "Topic :: System :: Archiving",
26
+ ]
27
+ dependencies = ["zstandard>=0.22.0", "orjson>=3.9.0"]
28
+
29
+ [project.optional-dependencies]
30
+ s3 = ["boto3>=1.34.0"]
31
+ cli = ["click>=8.1.7"]
32
+ parquet = ["pyarrow>=14.0.0"]
33
+ dev = ["pytest>=7.4.3", "pytest-cov>=4.1.0"]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/ponquersohn/splunk_ddss_extractor"
37
+ Repository = "https://github.com/ponquersohn/splunk_ddss_extractor"
38
+ Issues = "https://github.com/ponquersohn/splunk_ddss_extractor/issues"
39
+
40
+ [project.scripts]
41
+ splunk-extract = "splunk_ddss_extractor.main:main"
42
+
43
+ [tool.maturin]
44
+ manifest-path = "rust/Cargo.toml"
45
+ module-name = "splunk_ddss_extractor._native"
46
+ python-source = "src"
47
+ sdist-include = ["rust/Cargo.toml", "rust/src/**"]