ospac 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ospac has been flagged as potentially problematic.

@@ -0,0 +1,14 @@
+ """
+ OSPAC data processing pipeline.
+ Generates policy data from SPDX licenses using LLM analysis.
+ """
+
+ from ospac.pipeline.spdx_processor import SPDXProcessor
+ from ospac.pipeline.llm_analyzer import LicenseAnalyzer
+ from ospac.pipeline.data_generator import PolicyDataGenerator
+
+ __all__ = [
+     "SPDXProcessor",
+     "LicenseAnalyzer",
+     "PolicyDataGenerator",
+ ]
@@ -0,0 +1,530 @@
+ """
+ Policy data generator that produces OSPAC datasets.
+ Combines SPDX data with LLM analysis to generate comprehensive policy files.
+ """
+
+ import json
+ import yaml
+ import logging
+ import asyncio
+ from pathlib import Path
+ from typing import Dict, List, Any, Optional
+ from datetime import datetime
+
+ from ospac.pipeline.spdx_processor import SPDXProcessor
+ from ospac.pipeline.llm_analyzer import LicenseAnalyzer
+
+ logger = logging.getLogger(__name__)
+
+
+ class PolicyDataGenerator:
+     """
+     Generate comprehensive policy data from SPDX licenses.
+     Produces all required datasets for OSPAC runtime.
+     """
+
+     def __init__(self, output_dir: Optional[Path] = None, llm_provider: str = "ollama",
+                  llm_model: Optional[str] = None, llm_api_key: Optional[str] = None, **llm_kwargs):
+         """
+         Initialize the data generator.
+
+         Args:
+             output_dir: Output directory for generated data
+             llm_provider: LLM provider ("openai", "claude", "ollama")
+             llm_model: LLM model name (auto-selected if not provided)
+             llm_api_key: API key for cloud providers
+             **llm_kwargs: Additional LLM configuration
+         """
+         self.output_dir = output_dir or Path("data")
+         self.spdx_processor = SPDXProcessor()
+         self.llm_analyzer = LicenseAnalyzer(
+             provider=llm_provider,
+             model=llm_model,
+             api_key=llm_api_key,
+             **llm_kwargs
+         )
+
+         # Ensure output directories exist
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+         (self.output_dir / "licenses").mkdir(exist_ok=True)
+         (self.output_dir / "licenses" / "spdx").mkdir(exist_ok=True)
+         (self.output_dir / "compatibility").mkdir(exist_ok=True)
+         (self.output_dir / "compatibility" / "relationships").mkdir(exist_ok=True)
+         (self.output_dir / "obligations").mkdir(exist_ok=True)
+
+         # Progress tracking
+         self.progress_file = self.output_dir / "generation_progress.json"
+         self.processed_licenses = self._load_progress()
+
+     def _load_progress(self) -> set:
+         """Load previously processed licenses from progress file."""
+         if self.progress_file.exists():
+             try:
+                 with open(self.progress_file, 'r') as f:
+                     data = json.load(f)
+                     return set(data.get('processed_licenses', []))
+             except Exception as e:
+                 logger.warning(f"Failed to load progress file: {e}")
+         return set()
+
+     def _save_progress(self, license_id: str):
+         """Save progress after processing each license."""
+         self.processed_licenses.add(license_id)
+         progress_data = {
+             'last_updated': datetime.now().isoformat(),
+             'total_processed': len(self.processed_licenses),
+             'processed_licenses': list(self.processed_licenses)
+         }
+         try:
+             with open(self.progress_file, 'w') as f:
+                 json.dump(progress_data, f, indent=2)
+         except Exception as e:
+             logger.error(f"Failed to save progress: {e}")
+
+     def _generate_individual_policy(self, analysis: Dict[str, Any]):
+         """Generate individual policy file for a license."""
+         license_id = analysis.get("license_id")
+         if not license_id:
+             return
+
+         # Create policy structure
+         policy_data = {
+             "license": {
+                 "id": license_id,
+                 "name": license_id,
+                 "type": analysis.get("category", "unknown"),
+                 "spdx_id": license_id,
+                 "properties": analysis.get("permissions", {}),
+                 "requirements": analysis.get("conditions", {}),
+                 "limitations": analysis.get("limitations", {}),
+                 "compatibility": self._format_compatibility_rules(analysis.get("compatibility_rules", {})),
+                 "obligations": analysis.get("obligations", []),
+                 "key_requirements": analysis.get("key_requirements", [])
+             }
+         }
+
+         # Save to individual file
+         license_file = self.output_dir / "licenses" / "spdx" / f"{license_id}.yaml"
+         try:
+             with open(license_file, 'w') as f:
+                 yaml.dump(policy_data, f, default_flow_style=False, sort_keys=False)
+         except Exception as e:
+             logger.error(f"Failed to save policy file for {license_id}: {e}")
+
+     def _format_compatibility_rules(self, rules: Dict) -> Dict:
+         """Format compatibility rules for policy file."""
+         if not rules:
+             return {
+                 "static_linking": {"compatible_with": [], "incompatible_with": [], "requires_review": []},
+                 "dynamic_linking": {"compatible_with": [], "incompatible_with": [], "requires_review": []},
+                 "contamination_effect": "unknown",
+                 "notes": ""
+             }
+
+         return {
+             "static_linking": rules.get("static_linking", {}),
+             "dynamic_linking": rules.get("dynamic_linking", {}),
+             "contamination_effect": rules.get("contamination_effect", "unknown"),
+             "notes": rules.get("notes", "")
+         }
+
+     def _load_all_processed_licenses(self) -> List[Dict]:
+         """Load all previously processed license analyses."""
+         analyzed_licenses = []
+         spdx_dir = self.output_dir / "licenses" / "spdx"
+
+         for license_file in spdx_dir.glob("*.yaml"):
+             try:
+                 with open(license_file, 'r') as f:
+                     policy_data = yaml.safe_load(f)
+                     if "license" in policy_data:
+                         analyzed_licenses.append(policy_data["license"])
+             except Exception as e:
+                 logger.warning(f"Failed to load {license_file}: {e}")
+
+         return analyzed_licenses
+
+     def _update_master_databases(self, all_analyzed: List[Dict]):
+         """Update master databases with all processed licenses."""
+         # This method will update the main database files
+         pass
+
+     def _get_licenses_to_process(self, all_licenses: List[Dict], force: bool = False) -> List[Dict]:
+         """Get list of licenses that need processing (delta processing)."""
+         if force:
+             return all_licenses
+
+         # Filter out already processed licenses
+         licenses_to_process = []
+         for license_data in all_licenses:
+             license_id = license_data.get('licenseId', license_data.get('id', ''))
+             if license_id not in self.processed_licenses:
+                 licenses_to_process.append(license_data)
+
+         logger.info(f"Found {len(licenses_to_process)} new licenses to process out of {len(all_licenses)} total")
+         return licenses_to_process
+
+     async def generate_all_data(self, force_download: bool = False,
+                                 limit: Optional[int] = None,
+                                 force_reprocess: bool = False) -> Dict[str, Any]:
+         """
+         Generate all policy data from SPDX licenses.
+
+         Args:
+             force_download: Force re-download of SPDX data
+             limit: Limit number of licenses to process (for testing)
+             force_reprocess: Re-process licenses that were already analyzed
+
+         Returns:
+             Summary of generated data
+         """
+ logger.info("Starting policy data generation")
181
+
182
+ # Step 1: Download and process SPDX data
183
+ logger.info("Downloading SPDX license data...")
184
+ spdx_data = self.spdx_processor.download_spdx_data(force=force_download)
185
+ all_licenses = spdx_data["licenses"]
186
+
187
+ # Step 2: Determine which licenses need processing (delta processing)
188
+ licenses_to_process = self._get_licenses_to_process(all_licenses, force_reprocess)
189
+
190
+ if limit:
191
+ licenses_to_process = licenses_to_process[:limit]
192
+ logger.info(f"Processing limited to {limit} licenses")
193
+
194
+ if not licenses_to_process:
195
+ logger.info("No new licenses to process. All licenses up to date.")
196
+ return self._generate_summary(all_licenses)
197
+
198
+ logger.info(f"Processing {len(licenses_to_process)} licenses with progress tracking...")
199
+
200
+ # Step 3: Process licenses with progress tracking
201
+ processed_licenses = []
202
+ analyzed_licenses = []
203
+
204
+ for i, license_data in enumerate(licenses_to_process, 1):
205
+ license_id = license_data.get("licenseId")
206
+ if not license_id:
207
+ continue
208
+
209
+ logger.info(f"[{i}/{len(licenses_to_process)}] Processing {license_id}")
210
+
211
+ try:
212
+ # Get license text
213
+ license_text = self.spdx_processor.get_license_text(license_id)
214
+
215
+ license_to_analyze = {
216
+ "id": license_id,
217
+ "text": license_text or "",
218
+ "spdx_data": license_data
219
+ }
220
+
221
+ # Analyze with LLM
222
+ analysis = await self.llm_analyzer.analyze_license(license_id, license_text or "")
223
+ compatibility = await self.llm_analyzer.extract_compatibility_rules(license_id, analysis)
224
+ analysis["compatibility_rules"] = compatibility
225
+
226
+ analyzed_licenses.append(analysis)
227
+
228
+ # Generate individual policy file immediately
229
+ self._generate_individual_policy(analysis)
230
+
231
+ # Save progress after each license
232
+ self._save_progress(license_id)
233
+
234
+ logger.info(f"✓ Completed {license_id} ({i}/{len(licenses_to_process)})")
235
+
236
+ except Exception as e:
237
+ logger.error(f"Failed to process {license_id}: {e}")
238
+ continue
239
+
240
+ # Step 4: Update master databases and compatibility matrix
241
+ logger.info("Updating master databases...")
242
+ all_analyzed = self._load_all_processed_licenses()
243
+ self._update_master_databases(all_analyzed)
244
+ compatibility_matrix = self._generate_compatibility_matrix(analyzed_licenses)
245
+ obligation_database = self._generate_obligation_database(analyzed_licenses)
246
+
247
+ # Step 5: Generate aggregate datasets
248
+ logger.info("Generating aggregate datasets...")
249
+ self._generate_master_database(analyzed_licenses, compatibility_matrix, obligation_database)
250
+
251
+ # Step 6: Generate validation data
252
+ validation_report = self._validate_generated_data(analyzed_licenses)
253
+
254
+ summary = {
255
+ "total_licenses": len(analyzed_licenses),
256
+ "spdx_version": spdx_data.get("version"),
257
+ "generated_at": datetime.now().isoformat(),
258
+ "output_directory": str(self.output_dir),
259
+ "categories": self._count_categories(analyzed_licenses),
260
+ "validation": validation_report
261
+ }
262
+
263
+ # Save summary
264
+ summary_file = self.output_dir / "generation_summary.json"
265
+ with open(summary_file, "w") as f:
266
+ json.dump(summary, f, indent=2)
267
+
268
+ logger.info(f"Data generation complete. Summary saved to {summary_file}")
269
+ return summary
270
+
271
+     def _generate_license_policies(self, licenses: List[Dict[str, Any]]) -> None:
+         """Generate individual license policy files."""
+         license_dir = self.output_dir / "licenses" / "spdx"
+         license_dir.mkdir(parents=True, exist_ok=True)
+
+         for license_data in licenses:
+             license_id = license_data.get("license_id")
+             if not license_id:
+                 continue
+
+             # Create policy structure
+             policy = {
+                 "license": {
+                     "id": license_id,
+                     "name": license_data.get("name", license_id),
+                     "type": license_data.get("category", "permissive"),
+                     "spdx_id": license_id,
+
+                     "properties": license_data.get("permissions", {}),
+                     "requirements": license_data.get("conditions", {}),
+                     "limitations": license_data.get("limitations", {}),
+
+                     "compatibility": self._format_compatibility_for_policy(
+                         license_data.get("compatibility_rules", {})
+                     ),
+
+                     "obligations": license_data.get("obligations", []),
+                     "key_requirements": license_data.get("key_requirements", [])
+                 }
+             }
+
+             # Save as YAML
+             policy_file = license_dir / f"{license_id}.yaml"
+             with open(policy_file, "w") as f:
+                 yaml.dump(policy, f, default_flow_style=False, sort_keys=False)
+
+         logger.info(f"Generated {len(licenses)} license policy files")
+
+     def _format_compatibility_for_policy(self, rules: Dict[str, Any]) -> Dict[str, Any]:
+         """Format compatibility rules for policy file."""
+         return {
+             "static_linking": {
+                 "compatible_with": rules.get("static_linking", {}).get("compatible_with", []),
+                 "incompatible_with": rules.get("static_linking", {}).get("incompatible_with", []),
+                 "requires_review": rules.get("static_linking", {}).get("requires_review", [])
+             },
+             "dynamic_linking": {
+                 "compatible_with": rules.get("dynamic_linking", {}).get("compatible_with", []),
+                 "incompatible_with": rules.get("dynamic_linking", {}).get("incompatible_with", []),
+                 "requires_review": rules.get("dynamic_linking", {}).get("requires_review", [])
+             },
+             "contamination_effect": rules.get("contamination_effect", "none"),
+             "notes": rules.get("notes", "")
+         }
+
+     def _generate_compatibility_matrix(self, licenses: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """Generate license compatibility matrix using split architecture."""
+         from ospac.core.compatibility_matrix import CompatibilityMatrix
+
+         # Initialize the matrix handler
+         matrix_handler = CompatibilityMatrix(str(self.output_dir / "compatibility"))
+
+         # Build full matrix for conversion
+         full_matrix = {
+             "version": "1.0",
+             "generated": datetime.now().isoformat(),
+             "total_licenses": len(licenses),
+             "compatibility": {}
+         }
+
+         # Build compatibility matrix
+         for license1 in licenses:
+             id1 = license1.get("license_id")
+             if not id1:
+                 continue
+
+             full_matrix["compatibility"][id1] = {}
+
+             for license2 in licenses:
+                 id2 = license2.get("license_id")
+                 if not id2:
+                     continue
+
+                 # Determine compatibility
+                 compat = self._check_license_compatibility(license1, license2)
+                 full_matrix["compatibility"][id1][id2] = compat
+
+         # Save both formats: full matrix for backward compatibility and split for efficiency
+         # Save full matrix (can be removed later if space is an issue)
+         matrix_file = self.output_dir / "compatibility_matrix.json"
+         with open(matrix_file, "w") as f:
+             json.dump(full_matrix, f, indent=2)
+
+         # Convert to efficient split format
+         matrix_handler.build_from_full_matrix(str(matrix_file))
+
+         logger.info("Generated compatibility matrix in both formats")
+         logger.info(f"  Full matrix: {matrix_file}")
+         logger.info(f"  Split format: {self.output_dir / 'compatibility'}")
+
+         return full_matrix
+
+     def _check_license_compatibility(self, license1: Dict, license2: Dict) -> Dict[str, Any]:
+         """Check compatibility between two licenses."""
+         cat1 = license1.get("category", "permissive")
+         cat2 = license2.get("category", "permissive")
+
+         # Basic compatibility rules
+         compatibility = {
+             "static_linking": "unknown",
+             "dynamic_linking": "unknown",
+             "distribution": "unknown"
+         }
+
+         # Permissive licenses are generally compatible
+         if cat1 == "permissive" and cat2 == "permissive":
+             compatibility = {
+                 "static_linking": "compatible",
+                 "dynamic_linking": "compatible",
+                 "distribution": "compatible"
+             }
+
+         # Strong copyleft contamination
+         elif cat1 == "copyleft_strong" or cat2 == "copyleft_strong":
+             if cat1 == cat2:
+                 compatibility = {
+                     "static_linking": "compatible",
+                     "dynamic_linking": "compatible",
+                     "distribution": "compatible"
+                 }
+             else:
+                 compatibility = {
+                     "static_linking": "incompatible",
+                     "dynamic_linking": "review_required",
+                     "distribution": "incompatible"
+                 }
+
+         # Weak copyleft
+         elif cat1 == "copyleft_weak" or cat2 == "copyleft_weak":
+             compatibility = {
+                 "static_linking": "review_required",
+                 "dynamic_linking": "compatible",
+                 "distribution": "compatible"
+             }
+
+         return compatibility
+
+     def _generate_obligation_database(self, licenses: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """Generate obligation database."""
+         obligations = {
+             "version": "1.0",
+             "generated": datetime.now().isoformat(),
+             "licenses": {}
+         }
+
+         for license_data in licenses:
+             license_id = license_data.get("license_id")
+             if not license_id:
+                 continue
+
+             obligations["licenses"][license_id] = {
+                 "obligations": license_data.get("obligations", []),
+                 "key_requirements": license_data.get("key_requirements", []),
+                 "conditions": license_data.get("conditions", {}),
+                 "attribution_required": license_data.get("conditions", {}).get("include_copyright", False),
+                 "source_disclosure_required": license_data.get("conditions", {}).get("disclose_source", False),
+                 "notice_required": license_data.get("conditions", {}).get("include_notice", False)
+             }
+
+         # Save obligations
+         obligations_file = self.output_dir / "obligation_database.json"
+         with open(obligations_file, "w") as f:
+             json.dump(obligations, f, indent=2)
+
+         logger.info(f"Generated obligation database: {obligations_file}")
+         return obligations
+
+     def _generate_master_database(self, licenses: List[Dict[str, Any]],
+                                   compatibility_matrix: Dict[str, Any],
+                                   obligation_database: Dict[str, Any]) -> None:
+         """Generate master database with all license information."""
+         master_db = {
+             "version": "1.0",
+             "generated": datetime.now().isoformat(),
+             "total_licenses": len(licenses),
+             "licenses": {}
+         }
+
+         for license_data in licenses:
+             license_id = license_data.get("license_id")
+             if not license_id:
+                 continue
+
+             master_db["licenses"][license_id] = {
+                 "id": license_id,
+                 "name": license_data.get("name", license_id),
+                 "category": license_data.get("category"),
+                 "permissions": license_data.get("permissions"),
+                 "conditions": license_data.get("conditions"),
+                 "limitations": license_data.get("limitations"),
+                 "obligations": obligation_database["licenses"].get(license_id, {}).get("obligations", []),
+                 "compatibility_rules": license_data.get("compatibility_rules", {}),
+                 "spdx_metadata": {
+                     "is_osi_approved": license_data.get("spdx_data", {}).get("isOsiApproved", False),
+                     "is_fsf_libre": license_data.get("spdx_data", {}).get("isFsfLibre", False),
+                     "is_deprecated": license_data.get("spdx_data", {}).get("isDeprecatedLicenseId", False)
+                 }
+             }
+
+         # Save master database
+         master_file = self.output_dir / "ospac_license_database.json"
+         with open(master_file, "w") as f:
+             json.dump(master_db, f, indent=2)
+
+         logger.info(f"Generated master database: {master_file}")
+
+         # Also save as YAML for readability
+         master_yaml = self.output_dir / "ospac_license_database.yaml"
+         with open(master_yaml, "w") as f:
+             yaml.dump(master_db, f, default_flow_style=False)
+
+     def _count_categories(self, licenses: List[Dict[str, Any]]) -> Dict[str, int]:
+         """Count licenses by category."""
+         categories = {}
+         for license_data in licenses:
+             cat = license_data.get("category", "unknown")
+             categories[cat] = categories.get(cat, 0) + 1
+         return categories
+
+     def _validate_generated_data(self, licenses: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """Validate the generated data for completeness and consistency."""
+         report = {
+             "total_licenses": len(licenses),
+             "missing_category": 0,
+             "missing_permissions": 0,
+             "missing_obligations": 0,
+             "missing_compatibility": 0,
+             "validation_errors": []
+         }
+
+         for license_data in licenses:
+             license_id = license_data.get("license_id", "unknown")
+
+             if not license_data.get("category"):
+                 report["missing_category"] += 1
+                 report["validation_errors"].append(f"{license_id}: Missing category")
+
+             if not license_data.get("permissions"):
+                 report["missing_permissions"] += 1
+                 report["validation_errors"].append(f"{license_id}: Missing permissions")
+
+             if not license_data.get("obligations"):
+                 report["missing_obligations"] += 1
+
+             if not license_data.get("compatibility_rules"):
+                 report["missing_compatibility"] += 1
+
+         report["is_valid"] = len(report["validation_errors"]) == 0
+
+         return report
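
For orientation, here is a minimal usage sketch of the generator added in this release. It is not part of the package; it only exercises the constructor and generate_all_data signatures shown in the diff above, and it assumes the wheel is installed as ospac with a local Ollama model available (the default provider).

# Hypothetical driver script, not shipped with ospac.
import asyncio
from pathlib import Path

from ospac.pipeline.data_generator import PolicyDataGenerator

async def main():
    # Datasets are written under ./data; provider/model arguments mirror __init__ above.
    generator = PolicyDataGenerator(output_dir=Path("data"), llm_provider="ollama")
    # Process only a few licenses first to verify the setup (limit is intended for testing).
    summary = await generator.generate_all_data(limit=5)
    print(summary["total_licenses"], summary["output_directory"])

if __name__ == "__main__":
    asyncio.run(main())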