gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,372 @@
1
+ """Centralized progress reporting service for GitFlow Analytics.
2
+
3
+ This module provides a unified interface for progress reporting across the application,
4
+ replacing scattered tqdm usage with a centralized, testable, and configurable service.
5
+
6
+ WHY: Progress reporting was scattered across multiple modules (analyzer.py, data_fetcher.py,
7
+ batch_classifier.py, etc.), violating DRY principles and making it difficult to maintain
8
+ consistent progress UX. This service centralizes all progress management.
9
+
10
+ DESIGN DECISIONS:
11
+ - Context-based API: Each progress bar gets a context object for clean lifecycle management
12
+ - Thread-safe: Uses threading locks to ensure safe concurrent access
13
+ - Testable: Can be globally disabled for testing, with event capture capability
14
+ - Nested support: Handles nested progress contexts with proper positioning
15
+ - Consistent styling: All progress bars follow the same formatting rules
16
+
17
+ USAGE:
18
+ from gitflow_analytics.core.progress import get_progress_service
19
+
20
+ progress = get_progress_service()
21
+ context = progress.create_progress(100, "Processing items")
22
+ for item in items:
23
+ # Process item
24
+ progress.update(context)
25
+ progress.complete(context)
26
+ """
27
+
28
+ import os
29
+ import sys
30
+ import threading
31
+ from contextlib import contextmanager
32
+ from dataclasses import dataclass
33
+ from typing import Any, Optional
34
+
35
+ from tqdm import tqdm
36
+
37
+
38
@dataclass
class ProgressContext:
    """Context object for a single progress operation.

    Encapsulates all state for a progress bar, allowing clean lifecycle management
    and preventing resource leaks.
    """

    progress_bar: Optional[Any]  # tqdm instance or None if disabled
    description: str  # Text shown next to the bar
    total: int  # Total number of items expected
    unit: str  # Unit label, e.g. "commits", "files"
    # FIX: annotated Optional[int] — ProgressService.create_progress passes None
    # for non-nested bars created without an explicit position (tqdm then picks
    # its own placement), so the previous plain ``int`` annotation was wrong.
    position: Optional[int]
    current: int = 0  # Number of items completed so far
    is_nested: bool = False  # True when this bar is stacked under another bar
    parent_context: Optional["ProgressContext"] = None  # Enclosing context, if nested
54
+
55
+
56
@dataclass
class ProgressEvent:
    """Event captured during progress operations for testing.

    Allows tests to verify that progress operations occurred without
    actually displaying progress bars.
    """

    event_type: str  # 'create', 'update', 'complete'
    description: str  # Description of the bar at the moment the event fired
    total: Optional[int] = None  # Set on 'create' events: expected item count
    increment: Optional[int] = None  # Set on 'update' events: items added by this update
    current: Optional[int] = None  # Set on 'update'/'complete': cumulative count so far
69
+
70
+
71
class ProgressService:
    """Centralized service for managing progress reporting.

    This service provides a unified interface for creating and managing progress bars
    throughout the application. It supports nested progress contexts, global disable
    for testing, and event capture for verification.

    Thread-safety: all mutating operations take ``self._lock``; ``is_enabled`` is a
    lock-free read of a single boolean, which is safe under CPython.
    """

    def __init__(self):
        """Initialize the progress service."""
        self._enabled = True
        self._lock = threading.Lock()
        self._active_contexts: list[ProgressContext] = []
        self._position_counter = 0
        self._capture_events = False
        self._captured_events: list[ProgressEvent] = []

        # Check environment for testing mode
        self._check_testing_environment()

    def _check_testing_environment(self):
        """Check if running in a testing environment and disable if needed.

        WHY: Progress bars interfere with test output and can cause issues in CI/CD.
        This automatically detects common testing scenarios and disables progress.
        """
        # Disable in pytest
        if "pytest" in sys.modules:
            self._enabled = False

        # Disable if explicitly requested via environment
        if os.environ.get("GITFLOW_DISABLE_PROGRESS", "").lower() in ("1", "true", "yes"):
            self._enabled = False

        # Disable if not in a TTY (e.g., CI/CD, piped output)
        if not sys.stdout.isatty():
            self._enabled = False

    def create_progress(
        self,
        total: int,
        description: str,
        unit: str = "items",
        nested: bool = False,
        leave: bool = True,
        position: Optional[int] = None,
    ) -> ProgressContext:
        """Create a new progress context.

        Args:
            total: Total number of items to process
            description: Description shown next to the progress bar
            unit: Unit label for items (e.g., "commits", "repos", "files")
            nested: Whether this is a nested progress bar
            leave: Whether to leave the progress bar on screen after completion
            position: Explicit position for the progress bar (for nested contexts);
                remains None for non-nested bars, letting tqdm pick the placement

        Returns:
            ProgressContext object to use for updates

        DESIGN: Returns a context object rather than the tqdm instance directly
        to provide better lifecycle management and prevent resource leaks.
        """
        with self._lock:
            # Capture event if needed
            if self._capture_events:
                self._captured_events.append(ProgressEvent("create", description, total=total))

            # Determine position for nested progress bars; non-nested bars keep
            # position=None so tqdm chooses its own placement.
            if position is None:
                if nested:
                    self._position_counter += 1
                    position = self._position_counter

            # Create context
            context = ProgressContext(
                progress_bar=None,
                description=description,
                total=total,
                unit=unit,
                position=position,
                is_nested=nested,
            )

            # Create actual progress bar if enabled
            if self._enabled:
                context.progress_bar = tqdm(
                    total=total,
                    desc=description,
                    unit=unit,
                    position=position,
                    leave=leave,
                    # Consistent styling
                    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
                    dynamic_ncols=True,
                )

            self._active_contexts.append(context)
            return context

    def update(
        self, context: ProgressContext, increment: int = 1, description: Optional[str] = None
    ):
        """Update progress for a given context.

        Args:
            context: The progress context to update
            increment: Number of items completed (default: 1)
            description: Optional new description to set

        WHY: Centralizes update logic and ensures consistent behavior across
        all progress bars in the application.
        """
        with self._lock:
            context.current += increment

            # BUGFIX: keep the context's description in sync with the display.
            # Previously only the tqdm bar was updated here, so events captured
            # after this call (and complete()) reported the stale description,
            # inconsistent with set_description().
            if description:
                context.description = description

            # Capture event if needed
            if self._capture_events:
                self._captured_events.append(
                    ProgressEvent(
                        "update",
                        description or context.description,
                        increment=increment,
                        current=context.current,
                    )
                )

            # Update actual progress bar if it exists
            if context.progress_bar:
                context.progress_bar.update(increment)
                if description:
                    context.progress_bar.set_description(description)

    def set_description(self, context: ProgressContext, description: str):
        """Update the description of a progress context.

        Args:
            context: The progress context to update
            description: New description to display
        """
        with self._lock:
            context.description = description
            if context.progress_bar:
                context.progress_bar.set_description(description)

    def complete(self, context: ProgressContext):
        """Mark a progress context as complete and clean up resources.

        Args:
            context: The progress context to complete

        IMPORTANT: Always call this method when done with a progress context
        to ensure proper resource cleanup.
        """
        with self._lock:
            # Capture event if needed
            if self._capture_events:
                self._captured_events.append(
                    ProgressEvent("complete", context.description, current=context.current)
                )

            # Remove from active contexts BEFORE modifying progress_bar:
            # ProgressContext is a dataclass, so list membership uses field
            # equality, and nulling progress_bar first could make distinct
            # contexts compare equal.
            if context in self._active_contexts:
                self._active_contexts.remove(context)

            # Close actual progress bar if it exists
            if context.progress_bar:
                context.progress_bar.close()
                context.progress_bar = None

            # Reset position counter if no nested contexts remain
            if context.is_nested and not any(c.is_nested for c in self._active_contexts):
                self._position_counter = 0

    @contextmanager
    def progress(
        self,
        total: int,
        description: str,
        unit: str = "items",
        nested: bool = False,
        leave: bool = True,
    ):
        """Context manager for progress operations.

        Args:
            total: Total number of items to process
            description: Description shown next to the progress bar
            unit: Unit label for items
            nested: Whether this is a nested progress bar
            leave: Whether to leave the progress bar on screen

        Yields:
            ProgressContext object for updates

        Example:
            with progress.progress(100, "Processing") as ctx:
                for item in items:
                    process(item)
                    progress.update(ctx)
        """
        context = self.create_progress(total, description, unit, nested, leave)
        try:
            yield context
        finally:
            self.complete(context)

    def disable(self):
        """Disable all progress reporting globally.

        Useful for testing or quiet mode operation. Active contexts remain
        registered (their owners will still call complete()), but their bars
        are closed immediately.
        """
        with self._lock:
            self._enabled = False
            # Close any active progress bars
            for context in self._active_contexts[:]:
                if context.progress_bar:
                    context.progress_bar.close()
                    context.progress_bar = None

    def enable(self):
        """Enable progress reporting globally."""
        with self._lock:
            self._enabled = True

    def is_enabled(self) -> bool:
        """Check if progress reporting is enabled."""
        return self._enabled

    def start_event_capture(self):
        """Start capturing progress events for testing.

        WHY: Allows tests to verify that progress operations occurred
        without actually displaying progress bars.
        """
        with self._lock:
            self._capture_events = True
            self._captured_events = []

    def stop_event_capture(self) -> list[ProgressEvent]:
        """Stop capturing events and return captured events.

        Returns:
            List of ProgressEvent objects that were captured
        """
        with self._lock:
            self._capture_events = False
            events = self._captured_events[:]
            self._captured_events = []
            return events

    def get_captured_events(self) -> list[ProgressEvent]:
        """Get currently captured events without stopping capture.

        Returns:
            List of ProgressEvent objects captured so far
        """
        with self._lock:
            return self._captured_events[:]

    def clear_captured_events(self):
        """Clear captured events without stopping capture."""
        with self._lock:
            self._captured_events = []
336
+
337
+
338
# Global singleton instance shared by the whole process; access it only through
# get_progress_service() so creation stays thread-safe.
_progress_service: Optional[ProgressService] = None
# Guards lazy creation and reset of the singleton above.
_service_lock = threading.Lock()
341
+
342
+
343
def get_progress_service() -> ProgressService:
    """Return the process-wide ProgressService singleton.

    Returns:
        The singleton ProgressService instance

    The service is created lazily on first call; the module-level lock makes
    creation safe when multiple threads race here.
    """
    global _progress_service

    with _service_lock:
        if _progress_service is None:
            _progress_service = ProgressService()
        return _progress_service
359
+
360
+
361
def reset_progress_service():
    """Tear down the global progress service instance.

    WARNING: Only use this in tests or during application shutdown.
    Any active progress bars are closed (via disable()) and the next call to
    get_progress_service() will build a fresh instance.
    """
    global _progress_service

    with _service_lock:
        if _progress_service is not None:
            _progress_service.disable()
        _progress_service = None
@@ -0,0 +1,269 @@
1
+ """Schema versioning for tracking data structure changes."""
2
+
3
+ import hashlib
4
+ import json
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path
7
+ from typing import Any, Optional
8
+
9
+ from sqlalchemy import Column, DateTime, String, Text, create_engine
10
+ from sqlalchemy.orm import declarative_base, sessionmaker
11
+
12
+ Base = declarative_base()  # Declarative base shared by this module's ORM models
13
+
14
+
15
class SchemaVersion(Base):
    """Track schema versions for incremental data processing.

    One row per component, recording the schema hash last used to process that
    component's data and the most recent date processed under it.
    """

    __tablename__ = "schema_versions"

    # e.g., 'qualitative', 'identity', 'core'
    component = Column(String, primary_key=True)
    # Hash of schema definition (see SchemaVersionManager.get_schema_hash)
    version_hash = Column(String, nullable=False)
    # JSON schema definition
    schema_definition = Column(Text, nullable=False)
    # FIX: datetime.utcnow is deprecated (Python 3.12+) and returns a naive
    # datetime; use an aware UTC timestamp instead. The rest of this module
    # already normalizes naive/aware values, so this is backward compatible.
    created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))
    last_processed_date = Column(
        DateTime, nullable=True
    )  # Last date we processed data with this schema
27
+
28
+
29
class SchemaVersionManager:
    """Manages schema versions and determines if incremental processing is possible.

    Stores one SchemaVersion row per component in a small SQLite database under
    the cache directory, and compares stored schema hashes against the current
    CURRENT_SCHEMAS definitions (plus relevant configuration) to decide whether
    previously processed data can be reused.
    """

    # Define current schema versions for each component
    CURRENT_SCHEMAS = {
        "qualitative": {
            "version": "2.0",
            "fields": [
                "change_type",
                "change_type_confidence",
                "business_domain",
                "domain_confidence",
                "risk_level",
                "risk_factors",
                "intent_signals",
                "collaboration_patterns",
                "technical_context",
                "processing_method",
                "processing_time_ms",
                "confidence_score",
            ],
            "config_fields": [
                "nlp_config",
                "llm_config",
                "cache_config",
                "confidence_threshold",
                "max_llm_fallback_pct",
            ],
        },
        "identity": {
            "version": "1.3",
            "fields": [
                "canonical_id",
                "primary_name",
                "primary_email",
                "manual_mappings",
                "similarity_threshold",
                "auto_analysis",
                "display_names",
                "preferred_name_field",
            ],
        },
        "core": {
            "version": "1.0",
            "fields": [
                "story_points",
                "ticket_references",
                "files_changed",
                "insertions",
                "deletions",
                "complexity_delta",
                "branch_mapping_rules",
            ],
        },
        "github": {
            "version": "1.0",
            "fields": [
                "pr_data",
                "pr_metrics",
                "issue_data",
                "rate_limit_retries",
                "backoff_factor",
                "allowed_ticket_platforms",
            ],
        },
        "jira": {
            "version": "1.0",
            "fields": ["story_point_fields", "project_keys", "base_url", "issue_data"],
        },
    }

    def __init__(self, cache_dir: Path):
        """Initialize schema version manager.

        Args:
            cache_dir: Directory where the schema_versions.db SQLite file lives
        """
        self.cache_dir = cache_dir
        self.db_path = cache_dir / "schema_versions.db"
        self.engine = create_engine(f"sqlite:///{self.db_path}")
        Base.metadata.create_all(self.engine)
        self.session_factory = sessionmaker(bind=self.engine)

    def get_schema_hash(self, component: str, config: Optional[dict[str, Any]] = None) -> str:
        """Generate a deterministic hash for a component's schema and configuration.

        Args:
            component: One of the CURRENT_SCHEMAS keys
            config: Optional configuration; only keys listed in the component's
                "config_fields" contribute to the hash

        Returns:
            16-character hex digest (truncated SHA-256)

        Raises:
            ValueError: If the component is not defined in CURRENT_SCHEMAS
        """
        if component not in self.CURRENT_SCHEMAS:
            raise ValueError(f"Unknown component: {component}")

        schema_def = self.CURRENT_SCHEMAS[component].copy()

        # Include relevant configuration in the hash
        if config and "config_fields" in schema_def:
            relevant_config = {}
            for field in schema_def["config_fields"]:
                if field in config:
                    relevant_config[field] = self._normalize_config_value(config[field])
            schema_def["config"] = relevant_config

        # Create deterministic hash
        schema_json = json.dumps(schema_def, sort_keys=True)
        return hashlib.sha256(schema_json.encode()).hexdigest()[:16]

    def _normalize_config_value(self, value: Any) -> Any:
        """Normalize config values for consistent hashing.

        Dicts are key-sorted, lists are order-normalized (config lists are treated
        as sets for hashing purposes), scalars pass through, and anything else is
        stringified.
        """
        if isinstance(value, dict):
            return {k: self._normalize_config_value(v) for k, v in sorted(value.items())}
        elif isinstance(value, list):
            normalized = [self._normalize_config_value(v) for v in value]
            try:
                return sorted(normalized)
            except TypeError:
                # BUGFIX: mixed-type lists (e.g. [1, "a"]) are not orderable in
                # Python 3 and previously raised TypeError here. Fall back to a
                # deterministic repr-keyed sort; sortable lists keep their old
                # ordering (and therefore their old hashes).
                return sorted(normalized, key=repr)
        elif isinstance(value, (int, float, str, bool, type(None))):
            return value
        else:
            # Convert complex objects to string representation
            return str(value)

    def has_schema_changed(self, component: str, config: Optional[dict[str, Any]] = None) -> bool:
        """Check if schema has changed since last processing.

        Returns True when no version has been stored yet, or when the stored
        hash differs from the current schema+config hash.
        """
        current_hash = self.get_schema_hash(component, config)

        with self.session_factory() as session:
            stored_version = session.query(SchemaVersion).filter_by(component=component).first()

            if not stored_version:
                return True  # No previous schema, consider changed

            return stored_version.version_hash != current_hash

    def update_schema_version(
        self,
        component: str,
        config: Optional[dict[str, Any]] = None,
        last_processed_date: Optional[datetime] = None,
    ):
        """Persist the current schema hash (and optionally a processed date).

        Args:
            component: Component whose schema version to record
            config: Configuration included in the hash (see get_schema_hash)
            last_processed_date: Optional watermark of the latest processed data
        """
        current_hash = self.get_schema_hash(component, config)
        schema_def = json.dumps(self.CURRENT_SCHEMAS[component], sort_keys=True)

        # Ensure date is timezone-aware before storing
        if last_processed_date and last_processed_date.tzinfo is None:
            last_processed_date = last_processed_date.replace(tzinfo=timezone.utc)

        with self.session_factory() as session:
            stored_version = session.query(SchemaVersion).filter_by(component=component).first()

            if stored_version:
                stored_version.version_hash = current_hash
                stored_version.schema_definition = schema_def
                if last_processed_date:
                    stored_version.last_processed_date = last_processed_date
            else:
                stored_version = SchemaVersion(
                    component=component,
                    version_hash=current_hash,
                    schema_definition=schema_def,
                    last_processed_date=last_processed_date,
                )
                session.add(stored_version)

            session.commit()

    def get_last_processed_date(self, component: str) -> Optional[datetime]:
        """Get the last date data was processed for this component, if any."""
        with self.session_factory() as session:
            stored_version = session.query(SchemaVersion).filter_by(component=component).first()
            return stored_version.last_processed_date if stored_version else None

    def should_process_date(
        self, component: str, date: datetime, config: Optional[dict[str, Any]] = None
    ) -> bool:
        """Determine if we should process data for a given date.

        Processing is required when the schema changed, when nothing has been
        processed yet, or when the date lies after the stored watermark.
        """
        # Always process if schema has changed
        if self.has_schema_changed(component, config):
            return True

        # Check if we've already processed this date
        last_processed = self.get_last_processed_date(component)
        if not last_processed:
            return True

        # Ensure both dates are timezone-aware for comparison (SQLite drops tz info)
        if date.tzinfo is None:
            date = date.replace(tzinfo=timezone.utc)
        if last_processed.tzinfo is None:
            last_processed = last_processed.replace(tzinfo=timezone.utc)

        # Process if date is after last processed date
        return date > last_processed

    def mark_date_processed(
        self, component: str, date: datetime, config: Optional[dict[str, Any]] = None
    ):
        """Mark a date as processed for incremental tracking.

        Advances the stored watermark monotonically: an earlier date never
        overwrites a later one.
        """
        # Ensure date is timezone-aware before storing
        if date.tzinfo is None:
            date = date.replace(tzinfo=timezone.utc)

        with self.session_factory() as session:
            stored_version = session.query(SchemaVersion).filter_by(component=component).first()

            if stored_version:
                # Update to the latest processed date
                if not stored_version.last_processed_date:
                    stored_version.last_processed_date = date
                    session.commit()
                else:
                    # Ensure stored date is timezone-aware for comparison
                    stored_date = stored_version.last_processed_date
                    if stored_date.tzinfo is None:
                        stored_date = stored_date.replace(tzinfo=timezone.utc)

                    if date > stored_date:
                        stored_version.last_processed_date = date
                        session.commit()
            else:
                # Create new entry
                self.update_schema_version(component, config, date)

    def get_schema_info(
        self, component: str, config: Optional[dict[str, Any]] = None
    ) -> dict[str, Any]:
        """Get detailed schema information for debugging.

        Args:
            component: Component to inspect
            config: Optional configuration. FIX: previously this method ignored
                configuration entirely, so the reported hash / changed-flag could
                disagree with has_schema_changed(component, config). Passing the
                same config callers use elsewhere keeps the report consistent;
                the default (None) preserves the old behavior.
        """
        with self.session_factory() as session:
            stored_version = session.query(SchemaVersion).filter_by(component=component).first()

        current_hash = self.get_schema_hash(component, config)

        return {
            "component": component,
            "current_schema_hash": current_hash,
            "stored_schema_hash": stored_version.version_hash if stored_version else None,
            "schema_changed": self.has_schema_changed(component, config),
            "last_processed": stored_version.last_processed_date if stored_version else None,
            "created_at": stored_version.created_at if stored_version else None,
        }

    def reset_component(self, component: str):
        """Reset schema version for a component (forces full reprocessing)."""
        with self.session_factory() as session:
            stored_version = session.query(SchemaVersion).filter_by(component=component).first()
            if stored_version:
                session.delete(stored_version)
                session.commit()
264
+
265
+
266
def create_schema_manager(cache_dir: Path) -> SchemaVersionManager:
    """Build a SchemaVersionManager, creating *cache_dir* first if needed."""
    cache_dir.mkdir(parents=True, exist_ok=True)
    manager = SchemaVersionManager(cache_dir)
    return manager