sql-xel-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,379 @@
1
+ """
2
+ XEL Parser - Core module for parsing SQL Server Extended Events (.xel) files.
3
+
4
+ This module handles the low-level parsing of XEL binary files and extraction
5
+ of event data.
6
+ """
7
+
8
+ import struct
9
+ import sys
10
+ import xml.etree.ElementTree as ET
11
+ from typing import Iterator, Dict, Any, Optional, BinaryIO
12
+ from datetime import datetime
13
+ from io import BytesIO
14
+
15
+
16
class XELParser:
    """Parser for SQL Server Extended Events (.xel) files.

    Parsing is best-effort. The file is first scanned as a binary XEL
    container for embedded ``<event>`` XML fragments (ASCII/UTF-8 or
    UTF-16LE); if nothing is found the whole file is parsed as an XML
    export instead.
    """

    # XEL file format constants
    HEADER_MAGIC = b'XELH'
    EVENT_MAGIC = b'XEVT'

    # First four bytes of the binary files this parser recognises ("Z7\xab\xef").
    _BINARY_MAGIC = b'\x5a\x37\xab\xef'

    def __init__(self, file_path: str):
        """
        Initialize the XEL parser.

        Args:
            file_path: Path to the .xel file
        """
        self.file_path = file_path
        self.events_parsed = 0

    def parse(self) -> Iterator[Dict[str, Any]]:
        """
        Parse the XEL file and yield events as dictionaries.

        The binary scan runs first. The XML fallback runs when the binary
        scan produced no events or failed part-way through.

        Yields:
            Dictionary containing event data

        Raises:
            Exception: If the XML fallback was needed and also failed.
        """
        binary_error: Optional[Exception] = None
        events_found = 0
        try:
            with open(self.file_path, 'rb') as f:
                # Try to parse as XEL binary format
                for event in self._parse_binary_format(f):
                    self.events_parsed += 1
                    events_found += 1
                    yield event
        except Exception as e:
            binary_error = e
            print(f"Parsing error, attempting XML fallback: {e}", file=sys.stderr)

        if events_found and binary_error is None:
            return

        # Single XML attempt: retrying the same parse after it has already
        # failed cannot succeed and only duplicates the error text.
        try:
            for event in self._parse_xml_fallback():
                self.events_parsed += 1
                yield event
        except Exception as xml_error:
            first_error = binary_error if binary_error is not None else xml_error
            raise Exception(
                f"Failed to parse XEL file: Error: {first_error}, XML error: {xml_error}"
            ) from xml_error

    def _parse_binary_format(self, f: BinaryIO) -> Iterator[Dict[str, Any]]:
        """
        Parse XEL file in binary format.

        Scans the raw bytes for embedded ``<event>`` XML fragments. When
        none are found, falls back to heuristic extraction: the optional
        ``real_parser`` module if importable, otherwise a sample of UTF-16
        strings.

        Args:
            f: Binary file object positioned anywhere; it is rewound first

        Yields:
            Dictionary containing event data. Nothing is yielded when the
            file looks like XML or does not start with the known magic.
        """
        f.seek(0)
        data = f.read()

        if len(data) < 4:
            return

        # XML exports are handled by the XML fallback, not the binary scan.
        if data.startswith((b'<?xml', b'<event', b'<Events')):
            return

        magic = data[0:4]
        if magic != self._BINARY_MAGIC:
            # Unknown format, let the caller try XML parsing instead
            return

        events_found = 0

        # Look for XML event patterns (some XEL variants have XML).
        # NOTE: a fragment preceded by its own "<?xml ...?>" declaration can
        # be reported once per matching marker.
        xml_start_markers = [b'<event', b'<?xml', b'<\x00e\x00v\x00e\x00n\x00t\x00']

        for marker in xml_start_markers:
            offset = 0
            while True:
                pos = data.find(marker, offset)
                if pos == -1:
                    break
                offset = pos + 1

                xml_data = self._extract_xml_from_position(data, pos)
                if not xml_data:
                    continue
                event = self._parse_event_xml(xml_data)
                if event:
                    events_found += 1
                    yield event

        if events_found:
            return

        # No XML events found: try to extract audit data from the raw bytes.
        try:
            from .real_parser import extract_real_data_from_xel
        except ImportError:
            # Fallback to simple string extraction
            strings = self._extract_utf16_strings(data)
            yield {
                'name': 'binary_data',
                'timestamp': '',
                'data': {
                    'format': 'SQL Server XEL binary format',
                    'note': 'Install xel_real_parser for detailed extraction',
                    'extracted_strings_count': len(strings),
                    'extracted_strings_sample': strings[:10],
                    'file_size_bytes': len(data),
                    'magic_bytes': magic.hex()
                },
                'actions': {}
            }
            return

        real_data = extract_real_data_from_xel(self.file_path)

        # 'extracted_info' is left empty when the extractor hit an error, so
        # every key is read defensively instead of indexed.
        info = real_data.get('extracted_info') or {}
        timestamps = info.get('timestamps') or ['']
        servers = info.get('server_names') or ['unknown']

        # Main audit summary event
        yield {
            'name': 'audit_data_extracted',
            'timestamp': timestamps[0],
            'data': {
                'format': 'SQL Server XEL binary format',
                'extraction_method': 'Binary data analysis',
                'server': servers[0],
                'databases': info.get('database_names', []),
                'ip_addresses': info.get('ip_addresses', []),
                'event_types': info.get('event_types', []),
                'applications': info.get('applications', []),
                'users': info.get('users', []),
                'file_size_bytes': len(data),
            },
            'actions': info.get('session_info', {})
        }

        # Individual events for reconstructed data (first 10 only)
        for event in real_data.get('reconstructed_events', [])[:10]:
            yield {
                'name': event.get('event_type', 'unknown'),
                'timestamp': event.get('possible_timestamp', ''),
                'data': {k: v for k, v in event.items()
                         if k not in ['event_type', 'possible_timestamp']},
                'actions': {}
            }

    def _extract_utf16_strings(self, data: bytes, min_length: int = 8) -> list:
        """
        Extract UTF-16 encoded strings from binary data.

        Args:
            data: Binary data
            min_length: Minimum string length to extract

        Returns:
            Up to 100 unique strings, in order of first appearance
        """
        strings = []
        seen = set()
        # errors='ignore' means decoding cannot raise on malformed input.
        decoded = data.decode('utf-16-le', errors='ignore')
        for part in decoded.split('\x00'):
            part = part.strip()
            # Keep readable ASCII that looks like meaningful text
            if (len(part) >= min_length and
                    part.isprintable() and
                    part.isascii() and
                    part not in seen and
                    (' ' in part or '.' in part or '_' in part)):
                strings.append(part)
                seen.add(part)

        return strings[:100]

    def _extract_xml_from_position(self, data: bytes, start_pos: int) -> Optional[str]:
        """
        Extract complete XML from a starting position in binary data.

        Args:
            data: Binary data
            start_pos: Starting position of XML

        Returns:
            XML string or None if extraction fails
        """
        # In UTF-16LE the ASCII '<' is followed by a NUL byte; the closing
        # tag must then be searched for in the same encoding.
        is_utf16 = data[start_pos + 1:start_pos + 2] == b'\x00'
        if is_utf16:
            encodings = ('utf-16-le',)
        else:
            encodings = ('utf-8', 'utf-16-le', 'utf-16-be', 'latin-1')

        for tag in ('</event>', '</Event>'):
            end_marker = tag.encode('utf-16-le' if is_utf16 else 'ascii')
            end_pos = data.find(end_marker, start_pos)
            if is_utf16:
                # Skip matches that are not aligned to a 2-byte code unit.
                while end_pos != -1 and (end_pos - start_pos) % 2:
                    end_pos = data.find(end_marker, end_pos + 1)
            if end_pos == -1:
                continue

            xml_bytes = data[start_pos:end_pos + len(end_marker)]
            for encoding in encodings:
                try:
                    xml_str = xml_bytes.decode(encoding)
                except UnicodeDecodeError:
                    continue
                # Basic validation
                lowered = xml_str.lower()
                if '<event' in lowered and '</event>' in lowered:
                    return xml_str
        return None

    def _parse_xml_fallback(self) -> Iterator[Dict[str, Any]]:
        """
        Fallback method to parse XEL file as XML.
        Some tools export XEL data as XML.

        Yields:
            Dictionary containing event data

        Raises:
            Exception: If the file cannot be read or is not well-formed XML.
        """
        try:
            root = ET.parse(self.file_path).getroot()
        except Exception as e:
            raise Exception(f"XML parsing failed: {e}") from e

        # iter() includes the root itself, so a document whose top-level
        # element is a single <event> is handled too.
        events = list(root.iter('event')) or list(root.iter('Event'))
        for event_elem in events:
            event = self._parse_event_element(event_elem)
            if event:
                yield event

    def _parse_event_xml(self, xml_str: str) -> Optional[Dict[str, Any]]:
        """
        Parse an individual event XML string.

        Args:
            xml_str: XML string containing event data

        Returns:
            Dictionary with event data or None
        """
        try:
            root = ET.fromstring(xml_str.strip())
        except (ET.ParseError, ValueError):
            # Fragments cut out of binary data are frequently malformed.
            return None
        return self._parse_event_element(root)

    def _parse_event_element(self, elem: ET.Element) -> Dict[str, Any]:
        """
        Parse an event XML element into a dictionary.

        Args:
            elem: XML element representing an event

        Returns:
            Dictionary with event data
        """
        event = {
            'name': elem.get('name', 'unknown'),
            'timestamp': elem.get('timestamp', ''),
            'data': {},
            'actions': {}
        }

        # Parse data fields
        for data_elem in elem.findall('.//data') or elem.findall('.//Data'):
            name = data_elem.get('name', '')
            if name:
                event['data'][name] = self._extract_value(data_elem)

        # Parse action fields
        for action_elem in elem.findall('.//action') or elem.findall('.//Action'):
            name = action_elem.get('name', '')
            if name:
                event['actions'][name] = self._extract_value(action_elem)

        # Parse any direct text content
        if elem.text and elem.text.strip():
            event['content'] = elem.text.strip()

        return event

    def _extract_value(self, elem: ET.Element) -> Any:
        """
        Extract value from an XML element.

        Lookup order: the ``value`` attribute, then non-blank text content,
        then a ``<value>``/``<Value>`` child element.

        Args:
            elem: XML element

        Returns:
            Extracted value (string, int, float, etc.)
        """
        # Try 'value' attribute first
        value = elem.get('value')
        if value is not None:
            return self._convert_value(value)

        # Try text content; indentation-only text must not hide a child value
        text = (elem.text or '').strip()
        if text:
            return self._convert_value(text)

        # Try child elements. Compare against None explicitly: an Element
        # without children is falsy, so "find(...) or find(...)" would drop it.
        value_elem = elem.find('value')
        if value_elem is None:
            value_elem = elem.find('Value')
        if value_elem is not None and value_elem.text:
            return self._convert_value(value_elem.text.strip())

        # Whitespace-only text with no usable child keeps yielding ''.
        return '' if elem.text else None

    def _convert_value(self, value: str) -> Any:
        """
        Convert string value to appropriate type.

        Args:
            value: String value

        Returns:
            int, float or bool when the text parses as one, otherwise the
            original string
        """
        if not value:
            return value

        # Try integer, then float
        for caster in (int, float):
            try:
                return caster(value)
            except ValueError:
                pass

        # Try boolean
        lowered = value.lower()
        if lowered in ('true', 'false'):
            return lowered == 'true'

        # Return as string
        return value

    def get_stats(self) -> Dict[str, Any]:
        """
        Get parsing statistics.

        Returns:
            Dictionary with stats
        """
        return {
            'file_path': self.file_path,
            'events_parsed': self.events_parsed
        }
@@ -0,0 +1,295 @@
1
+ """
2
+ Real XEL Parser - Extract actual data from SQL Server binary XEL files.
3
+
4
+ This parser extracts meaningful audit information from SQL Server Extended Events files
5
+ without requiring SQL Server.
6
+ """
7
+
8
+ import struct
9
+ import re
10
+ from typing import List, Dict, Any
11
+ from collections import defaultdict
12
+
13
+
14
def extract_real_data_from_xel(file_path: str) -> Dict[str, Any]:
    """
    Extract real audit/event data from a SQL Server XEL binary file.

    The file is read in full, decoded as UTF-16LE (undecodable bytes are
    ignored) and passed through each heuristic extractor. Any exception
    raised during extraction is recorded under the ``'error'`` key rather
    than propagated; in that case ``'extracted_info'`` stays empty.

    Args:
        file_path: Path to .xel file

    Returns:
        Dictionary with extracted event data
    """
    with open(file_path, 'rb') as handle:
        raw = handle.read()

    magic_hex = raw[:4].hex() if len(raw) >= 4 else None
    result = {
        'file_path': file_path,
        'file_size': len(raw),
        'magic': magic_hex,
        'extracted_info': {}
    }

    try:
        # Decode as UTF-16 to extract strings
        text = raw.decode('utf-16-le', errors='ignore')

        # (result key, extractor) pairs, evaluated in this order
        extractors = (
            ('server_names', extract_servers),
            ('database_names', extract_databases),
            ('sql_statements', extract_sql),
            ('ip_addresses', extract_ips),
            ('event_types', extract_event_types),
            ('timestamps', extract_timestamps),
            ('session_info', extract_sessions),
            ('users', extract_users),
            ('applications', extract_applications),
        )
        result['extracted_info'] = {key: extract(text) for key, extract in extractors}

        # Try to reconstruct partial events from the data
        result['reconstructed_events'] = reconstruct_events(text, raw)

    except Exception as e:
        result['error'] = str(e)

    return result
58
+
59
+
60
def extract_servers(text: str) -> List[str]:
    """
    Extract server names.

    Matches names shaped like ``sql-<name>-<letters><digits>``.

    Args:
        text: Decoded file content to scan

    Returns:
        Unique server names in order of first appearance, so that callers
        taking the first element get a stable answer between runs
    """
    pattern = r'sql-[a-zA-Z0-9\-]+-[a-z]+\d+'
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(re.findall(pattern, text)))
65
+
66
+
67
def extract_databases(text: str) -> List[str]:
    """
    Extract database names.

    Candidates come from three heuristics: regex context patterns, words
    that directly follow a database-related keyword, and the system database
    names appearing anywhere in the text.

    Args:
        text: Decoded file content to scan

    Returns:
        Sorted list of candidate names, 3 to 49 characters long and not
        purely numeric
    """
    candidates = set()

    # Heuristic 1: names in an explicit SQL / key-value context
    context_patterns = (
        r'(?:database|db)[:\s]+([a-zA-Z][a-zA-Z0-9_]{2,30})',
        r'USE\s+([a-zA-Z][a-zA-Z0-9_]{2,30})',
        r'FROM\s+([a-zA-Z][a-zA-Z0-9_]{2,30})\.',
    )
    for regex in context_patterns:
        candidates.update(re.findall(regex, text, re.IGNORECASE))

    # Heuristic 2: an alphanumeric word whose predecessor contains a keyword
    keywords = ('database', 'db', 'use', 'from')
    tokens = text.split()
    for previous, current in zip(tokens, tokens[1:]):
        if len(current) <= 3 or not current.isalnum() or current.isdigit():
            continue
        previous_lower = previous.lower()
        if any(kw in previous_lower for kw in keywords):
            candidates.add(current)

    # Heuristic 3: common system databases mentioned anywhere
    lowered = text.lower()
    for system_db in ('master', 'tempdb', 'model', 'msdb'):
        if system_db in lowered:
            candidates.add(system_db)

    # Filter out noise
    return sorted(
        name for name in candidates
        if 2 < len(name) < 50 and not name.isdigit()
    )
101
+
102
+
103
def extract_sql(text: str) -> List[str]:
    """
    Extract SQL statements.

    A fragment starts at a statement keyword and runs lazily (10-200
    characters) up to the next ``;`` or clause keyword.

    Args:
        text: Decoded file content to scan

    Returns:
        Up to 20 unique whitespace-normalised fragments, each at most 200
        characters, in order of first appearance
    """
    # The keyword group is non-capturing on purpose: with a capturing group
    # re.findall returns only the keyword, never the statement text.
    sql_pattern = (
        r'(?:SELECT|INSERT|UPDATE|DELETE|EXECUTE|EXEC|CREATE|ALTER|DROP)'
        r'[\s\S]{10,200}?(?:;|FROM|WHERE|INTO|SET|VALUES)'
    )

    statements = {}
    for match in re.findall(sql_pattern, text, re.IGNORECASE):
        stmt = ' '.join(match.split())  # Normalize whitespace
        if len(stmt) > 15:
            statements.setdefault(stmt[:200])  # Limit length, keep first-seen order

    return list(statements)[:20]  # Return up to 20 unique statements
117
+
118
+
119
def extract_ips(text: str) -> List[str]:
    """
    Extract IP addresses.

    Args:
        text: Decoded file content to scan

    Returns:
        Unique dotted-quad strings whose four parts are all in 0-255, in
        order of first appearance
    """
    pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    # dict.fromkeys de-duplicates while keeping a stable, first-seen order.
    valid_ips = dict.fromkeys(
        ip for ip in re.findall(pattern, text)
        if all(0 <= int(part) <= 255 for part in ip.split('.'))
    )
    return list(valid_ips)
130
+
131
+
132
def extract_event_types(text: str) -> List[str]:
    """
    Extract event types.

    Combines a fixed list of known names (matched case-insensitively as
    substrings) with any word shaped like ``*_event`` or ``event_*``.

    Args:
        text: Decoded file content to scan

    Returns:
        Sorted list of lower-cased event type names
    """
    # Known XEL event types
    known_types = (
        'audit_event', 'sql_batch_completed', 'sql_batch_starting',
        'rpc_completed', 'rpc_starting', 'login', 'logout',
        'attention', 'existing_connection', 'session_id',
        'audit_schema_version', 'event_sequence',
    )

    lowered = text.lower()
    found = {name for name in known_types if name in lowered}

    # Look for patterns like "event_*" or "*_event"
    generic = re.findall(r'\b\w+_event\b|\bevent_\w+\b', text, re.IGNORECASE)
    found |= {token.lower() for token in generic}

    return sorted(found)
154
+
155
+
156
def extract_timestamps(text: str) -> List[str]:
    """
    Extract timestamps.

    Recognises ISO-style ``YYYY-MM-DD`` + ``T`` or whitespace + ``HH:MM:SS``
    with optional fractional seconds and optional ``Z`` / ``+HH:MM`` offset.

    Args:
        text: Decoded file content to scan

    Returns:
        The first 50 unique timestamps in order of appearance. Callers index
        into this list, so the order must be stable between runs.
    """
    # ISO format timestamps
    pattern = r'\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?'
    unique = dict.fromkeys(re.findall(pattern, text))
    return list(unique)[:50]  # Limit to 50
166
+
167
+
168
def extract_sessions(text: str) -> Dict[str, Any]:
    """
    Extract session information.

    Args:
        text: Decoded file content to scan

    Returns:
        Dictionary that may contain ``'session_ids'`` (up to 20 unique
        numeric id strings) and ``'audit_sessions'`` (up to 10 unique
        ``SqlDbAuditing...`` fragments). A key is present only when at
        least one match was found.
    """
    # (result key, pattern, regex flags, max entries)
    searches = (
        ('session_ids', r'session[_\s]*id[:\s]*(\d+)', re.IGNORECASE, 20),
        ('audit_sessions', r'SqlDbAuditing[^<>]{0,100}', 0, 10),
    )

    session_info = {}
    for key, pattern, flags, limit in searches:
        hits = re.findall(pattern, text, flags)
        if hits:
            session_info[key] = list(set(hits))[:limit]

    return session_info
185
+
186
+
187
def extract_users(text: str) -> List[str]:
    """
    Extract usernames.

    Names are taken from ``user``/``login``/``USER_NAME()`` contexts. The
    well-known principals ``sa`` and ``dbo`` are added when they occur as
    whole words.

    Args:
        text: Decoded file content to scan

    Returns:
        Up to 30 unique user names, sorted
    """
    # Look for common user patterns
    patterns = [
        r'user[:\s]+([a-zA-Z][a-zA-Z0-9_@\-\.]{2,50})',
        r'login[:\s]+([a-zA-Z][a-zA-Z0-9_@\-\.]{2,50})',
        r'USER_NAME\(\)[:\s]*([a-zA-Z][a-zA-Z0-9_@\-\.]{2,50})',
    ]

    users = set()
    for pattern in patterns:
        users.update(re.findall(pattern, text, re.IGNORECASE))

    # Filter out noise
    filtered = {u for u in users if 2 < len(u) < 100}

    # Common system users. They are added after the length filter (which
    # would otherwise always discard the two-letter 'sa') and matched as
    # whole words so that e.g. "message" or "sandbox" do not count.
    for system_user in ('sa', 'dbo'):
        if re.search(rf'\b{system_user}\b', text):
            filtered.add(system_user)

    return sorted(filtered)[:30]
211
+
212
+
213
def extract_applications(text: str) -> List[str]:
    """
    Extract application names.

    Args:
        text: Decoded file content to scan

    Returns:
        Up to 20 unique matches of the client-library / application name
        patterns, in no particular order
    """
    # Common client-library and application name shapes
    name_patterns = (
        r'Core Microsoft SqlClient Data Provider',
        r'\.NET [^<>\s]{5,50}',
        r'ODBC Driver \d+',
        r'[A-Z][a-zA-Z0-9\s]{5,50}(?:Application|App|Client|Service)',
    )

    apps = set()
    for regex in name_patterns:
        for hit in re.findall(regex, text):
            apps.add(hit)

    return list(apps)[:20]
230
+
231
+
232
def reconstruct_events(text: str, binary_data: bytes) -> List[Dict[str, Any]]:
    """
    Attempt to reconstruct event records from extracted data.

    One ``sql_execution`` event is built per extracted SQL fragment,
    followed by one metadata event per detected event type. Associations
    between a statement and a database or timestamp are positional guesses,
    which is why those keys are prefixed with ``possible_``.

    Args:
        text: Decoded file content to scan
        binary_data: Raw file bytes; accepted for interface compatibility
            and currently unused

    Returns:
        Up to 50 reconstructed event dictionaries
    """
    # Extract only the pieces that are actually used below.
    servers = extract_servers(text)
    databases = extract_databases(text)
    sql_stmts = extract_sql(text)
    timestamps = extract_timestamps(text)
    event_types = extract_event_types(text)

    events = []

    # If we have SQL statements, create events for them
    for i, stmt in enumerate(sql_stmts):
        event = {
            'event_type': 'sql_execution',
            'statement': stmt,
        }

        # Try to associate with other data
        if databases:
            # Cycle through the databases when there are more statements
            event['possible_database'] = databases[i % len(databases)]
        if servers:
            event['server'] = servers[0]
        if i < len(timestamps):
            event['possible_timestamp'] = timestamps[i]

        events.append(event)

    # Create generic audit events for other extracted info
    for event_type in event_types:
        event = {
            'event_type': event_type,
            'extracted_from': 'audit_metadata'
        }
        if servers:
            event['server'] = servers[0]
        events.append(event)

    return events[:50]  # Limit to 50 events
277
+
278
+
279
def parse_xel_file(file_path: str) -> Dict[str, Any]:
    """
    Main entry point to parse a real XEL file.

    Thin wrapper that forwards to extract_real_data_from_xel() and returns
    its result unchanged.

    Args:
        file_path: Path to .xel file

    Returns:
        Dictionary with extracted event data
    """
    return extract_real_data_from_xel(file_path)
284
+
285
+
286
if __name__ == '__main__':
    # Command-line entry point: parse the file named by the first argument
    # and print the extracted data as indented JSON on stdout.
    import sys
    import json

    # A file path is required; print usage and exit with status 1 otherwise.
    if len(sys.argv) < 2:
        print("Usage: python xel_real_parser.py <xel_file>")
        sys.exit(1)

    result = parse_xel_file(sys.argv[1])
    print(json.dumps(result, indent=2))