@aborruso/ckan-mcp-server 0.4.13 → 0.4.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,299 @@
+ #!/usr/bin/env python3
+ """
+ Metadata Quality Scoring for CKAN Datasets
+
+ Advanced quality scoring system based on:
+ - Completeness (required and recommended fields)
+ - Richness (descriptions, tags, temporal coverage)
+ - Resource quality (formats, accessibility)
+ - Temporal freshness
+
+ Score: 0-100 points
+ """
+
+ from datetime import datetime
+ from typing import Any
+
+
+ class MetadataQualityScorer:
+     """Calculate metadata quality score for CKAN datasets."""
+
+     # Quality thresholds
+     EXCELLENT = 80
+     GOOD = 60
+     ACCEPTABLE = 40
+     POOR = 0
+
+     @classmethod
+     def score_dataset(cls, dataset: dict[str, Any]) -> dict[str, Any]:
+         """
+         Calculate comprehensive quality score.
+
+         Returns:
+             {
+                 "score": 75,        # Total score 0-100
+                 "level": "good",    # excellent/good/acceptable/poor
+                 "breakdown": {
+                     "completeness": 20,  # out of 30
+                     "richness": 15,      # out of 30
+                     "resources": 25,     # out of 30
+                     "freshness": 8       # out of 10
+                 },
+                 "issues": ["Missing license", ...]
+             }
+         """
+         issues = []
+         breakdown = {
+             "completeness": cls._score_completeness(dataset, issues),
+             "richness": cls._score_richness(dataset, issues),
+             "resources": cls._score_resources(dataset, issues),
+             "freshness": cls._score_freshness(dataset, issues),
+         }
+
+         total_score = sum(breakdown.values())
+         level = cls._get_level(total_score)
+
+         return {
+             "score": total_score,
+             "level": level,
+             "breakdown": breakdown,
+             "issues": issues,
+         }
+
+     @classmethod
+     def _score_completeness(cls, dataset: dict, issues: list) -> int:
+         """Score 0-30: Required and recommended fields."""
+         score = 0
+
+         # Required fields (15 points)
+         if dataset.get("title"):
+             score += 5
+         else:
+             issues.append("Missing title")
+
+         if dataset.get("notes"):  # Description
+             score += 5
+         else:
+             issues.append("Missing description")
+
+         if dataset.get("name"):  # Identifier
+             score += 5
+         else:
+             issues.append("Missing identifier")
+
+         # Recommended fields (15 points)
+         if dataset.get("license_id"):
+             score += 3
+         else:
+             issues.append("Missing license")
+
+         if dataset.get("author") or dataset.get("maintainer"):
+             score += 3
+         else:
+             issues.append("Missing author/maintainer")
+
+         if dataset.get("author_email") or dataset.get("maintainer_email"):
+             score += 3
+         else:
+             issues.append("Missing contact email")
+
+         # Organization
+         if dataset.get("organization"):
+             score += 3
+         else:
+             issues.append("Not assigned to an organization")
+
+         # Geographical coverage
+         if dataset.get("extras"):
+             has_geo = any(
+                 e.get("key") in ["spatial", "geographic_coverage"]
+                 for e in dataset.get("extras", [])
+             )
+             if has_geo:
+                 score += 3
+
+         return score
+
+     @classmethod
+     def _score_richness(cls, dataset: dict, issues: list) -> int:
+         """Score 0-30: Richness of metadata."""
+         score = 0
+
+         # Description quality (10 points)
+         notes = dataset.get("notes", "")
+         if len(notes) > 200:
+             score += 10
+         elif len(notes) > 100:
+             score += 5
+         elif len(notes) > 0:
+             score += 2
+         else:
+             issues.append("Very short or missing description")
+
+         # Tags (10 points)
+         tags = dataset.get("tags", [])
+         num_tags = len(tags)
+         if num_tags >= 5:
+             score += 10
+         elif num_tags >= 3:
+             score += 6
+         elif num_tags >= 1:
+             score += 3
+         else:
+             issues.append("No tags")
+
+         # Temporal coverage (5 points)
+         extras = {e.get("key"): e.get("value") for e in dataset.get("extras", [])}
+         if "temporal_start" in extras or "temporal_end" in extras:
+             score += 5
+
+         # Frequency/update schedule (5 points)
+         if extras.get("frequency") or extras.get("update_frequency"):
+             score += 5
+
+         return score
+
+     @classmethod
+     def _score_resources(cls, dataset: dict, issues: list) -> int:
+         """Score 0-30: Resource quality."""
+         score = 0
+         resources = dataset.get("resources", [])
+
+         if not resources:
+             issues.append("No resources")
+             return 0
+
+         # At least one resource (5 points)
+         score += 5
+
+         # Check formats (10 points, plus a 2-point bonus for CSV)
+         formats = {r.get("format", "").upper() for r in resources}
+         open_formats = {"CSV", "JSON", "GEOJSON", "XML", "RDF", "JSONLD"}
+         if formats & open_formats:
+             score += 10
+             if "CSV" in formats:
+                 score += 2  # Bonus for CSV
+         else:
+             issues.append("No open formats (CSV/JSON/XML)")
+
+         # Resource descriptions (5 points)
+         described = sum(1 for r in resources if r.get("description"))
+         if described == len(resources):
+             score += 5
+         elif described > 0:
+             score += 2
+
+         # DataStore availability (5 points)
+         has_datastore = any(r.get("datastore_active") for r in resources)
+         if has_datastore:
+             score += 5
+
+         # URL validity (5 points)
+         valid_urls = sum(
+             1 for r in resources if r.get("url") and r["url"].startswith("http")
+         )
+         if valid_urls == len(resources):
+             score += 5
+         elif valid_urls > 0:
+             score += 2
+         else:
+             issues.append("Invalid or missing resource URLs")
+
+         # The CSV bonus can push the raw total to 32; cap at the documented maximum
+         return min(score, 30)
+
+     @classmethod
+     def _score_freshness(cls, dataset: dict, issues: list) -> int:
+         """Score 0-10: Temporal freshness."""
+         score = 0
+
+         # Check metadata_modified
+         modified_str = dataset.get("metadata_modified")
+         if not modified_str:
+             issues.append("No last modified date")
+             return 0
+
+         try:
+             modified = datetime.fromisoformat(modified_str.replace("Z", "+00:00"))
+             now = datetime.now(modified.tzinfo)
+             days_old = (now - modified).days
+
+             if days_old < 90:  # < 3 months
+                 score = 10
+             elif days_old < 180:  # < 6 months
+                 score = 7
+             elif days_old < 365:  # < 1 year
+                 score = 5
+             elif days_old < 730:  # < 2 years
+                 score = 3
+             else:
+                 score = 1
+                 issues.append(f"Last updated {days_old} days ago")
+
+         except (ValueError, AttributeError):
+             issues.append("Invalid date format")
+
+         return score
+
+     @classmethod
+     def _get_level(cls, score: int) -> str:
+         """Convert score to quality level."""
+         if score >= cls.EXCELLENT:
+             return "excellent"
+         elif score >= cls.GOOD:
+             return "good"
+         elif score >= cls.ACCEPTABLE:
+             return "acceptable"
+         else:
+             return "poor"
+
+
+ # Example usage
+ if __name__ == "__main__":
+     # Sample dataset
+     sample_dataset = {
+         "title": "Sample Dataset",
+         "name": "sample-dataset",
+         "notes": "This is a sample dataset with a detailed description " * 5,
+         "license_id": "cc-by-4.0",
+         "author": "Mario Rossi",
+         "author_email": "mario@example.com",
+         "organization": {"name": "comune-roma"},
+         "tags": [
+             {"name": "environment"},
+             {"name": "air-quality"},
+             {"name": "open-data"},
+         ],
+         "resources": [
+             {
+                 "format": "CSV",
+                 "url": "https://example.com/data.csv",
+                 "description": "Data in CSV format",
+                 "datastore_active": True,
+             },
+             {
+                 "format": "JSON",
+                 "url": "https://example.com/data.json",
+                 "description": "Data in JSON format",
+             },
+         ],
+         "metadata_modified": "2025-01-15T10:00:00Z",
+     }
+
+     scorer = MetadataQualityScorer()
+     result = scorer.score_dataset(sample_dataset)
+
+     print("Metadata Quality Assessment")
+     print("=" * 50)
+     print(f"Overall Score: {result['score']}/100")
+     print(f"Quality Level: {result['level'].upper()}")
+     print("\nBreakdown:")
+     for category, score in result["breakdown"].items():
+         max_points = 10 if category == "freshness" else 30
+         print(f"  {category.capitalize():15} {score:2}/{max_points}")
+     if result["issues"]:
+         print(f"\nIssues ({len(result['issues'])}):")
+         for issue in result["issues"]:
+             print(f"  - {issue}")
@@ -0,0 +1,12 @@
+ # LangGraph Examples - Python Dependencies
+
+ # Core dependencies
+ langgraph>=0.2.0
+ langchain-core>=0.3.0
+
+ # MCP Python SDK for client connection
+ mcp>=1.0.0
+
+ # Optional: LangSmith for debugging/tracing
+ # Uncomment if you want to use LangSmith
+ # langsmith>=0.1.0
@@ -0,0 +1,32 @@
+ #!/bin/bash
+ # Setup script for LangGraph examples
+
+ set -e
+
+ echo "Setting up LangGraph examples environment..."
+
+ # Check Python version
+ python3 --version
+
+ # Create virtual environment if it doesn't exist
+ if [ ! -d "venv" ]; then
+     echo "Creating virtual environment..."
+     python3 -m venv venv
+ fi
+
+ # Activate virtual environment
+ source venv/bin/activate
+
+ # Install dependencies
+ echo "Installing dependencies..."
+ pip install -r requirements.txt
+
+ echo ""
+ echo "✓ Setup complete!"
+ echo ""
+ echo "To activate the environment:"
+ echo "  source venv/bin/activate"
+ echo ""
+ echo "To run examples:"
+ echo "  python 01_basic_workflow.py"
+ echo "  python 02_data_exploration.py"
@@ -0,0 +1,106 @@
+ #!/usr/bin/env python3
+ """
+ Quick test to verify LangGraph + MCP setup
+
+ Run:
+     uv run --with langgraph --with mcp --with langchain-core python test_setup.py
+ """
+
+ import sys
+
+
+ def test_imports():
+     """Test that all required packages are available."""
+     print("Testing imports...")
+     errors = []
+
+     try:
+         import langgraph  # noqa: F401
+
+         print("✓ langgraph")
+     except ImportError as e:
+         errors.append(f"✗ langgraph: {e}")
+
+     try:
+         import mcp  # noqa: F401
+
+         print("✓ mcp")
+     except ImportError as e:
+         errors.append(f"✗ mcp: {e}")
+
+     try:
+         import langchain_core  # noqa: F401
+
+         print("✓ langchain_core")
+     except ImportError as e:
+         errors.append(f"✗ langchain_core: {e}")
+
+     return errors
+
+
+ def test_mcp_server():
+     """Test that MCP server file exists."""
+     import os
+
+     print("\nTesting MCP server...")
+     server_path = os.path.join(os.path.dirname(__file__), "../../dist/index.js")
+
+     if os.path.exists(server_path):
+         print(f"✓ MCP server found: {server_path}")
+         return []
+     else:
+         return [
+             f"✗ MCP server not found: {server_path}",
+             "  Run: cd ../.. && npm run build",
+         ]
+
+
+ def test_node():
+     """Test that Node.js is available."""
+     import subprocess
+
+     print("\nTesting Node.js...")
+     try:
+         result = subprocess.run(
+             ["node", "--version"], capture_output=True, text=True, check=True
+         )
+         version = result.stdout.strip()
+         print(f"✓ Node.js {version}")
+         return []
+     except (subprocess.CalledProcessError, FileNotFoundError):
+         return ["✗ Node.js not found or not in PATH"]
+
+
+ def main():
+     """Run all tests."""
+     print("=" * 60)
+     print("LangGraph + CKAN MCP Setup Test")
+     print("=" * 60)
+
+     all_errors = []
+
+     # Run tests
+     all_errors.extend(test_imports())
+     all_errors.extend(test_node())
+     all_errors.extend(test_mcp_server())
+
+     # Summary
+     print("\n" + "=" * 60)
+     if all_errors:
+         print("SETUP INCOMPLETE")
+         print("=" * 60)
+         for error in all_errors:
+             print(error)
+         print("\nSee README.md for setup instructions")
+         sys.exit(1)
+     else:
+         print("✓ ALL TESTS PASSED")
+         print("=" * 60)
+         print("\nYou can now run:")
+         print("  python 01_basic_workflow.py")
+         print("  python 02_data_exploration.py")
+         sys.exit(0)
+
+
+ if __name__ == "__main__":
+     main()
@@ -0,0 +1,21 @@
+ # Change: Add MQA Quality Score Tool for dati.gov.it
+
+ ## Why
+ Datasets on dati.gov.it display quality scores (Eccellente, Buono, etc.) calculated by data.europa.eu's MQA (Metadata Quality Assurance) system. Currently there is no way to access these quality metrics through the MCP server, which limits users' ability to evaluate dataset quality programmatically.
+
+ ## What Changes
+ - Add `ckan_get_mqa_quality` tool for retrieving quality metrics from data.europa.eu
+ - The tool works only with the dati.gov.it server (validated at runtime)
+ - Fetches the dataset identifier from CKAN, then queries the MQA API
+ - Returns the quality score and detailed metrics (accessibility, reusability, interoperability, findability)
+ - Supports both markdown and JSON output formats
+
+ ## Impact
+ - Affected specs: New capability `ckan-quality`
+ - Affected code:
+   - New file: `src/tools/quality.ts` (tool handler)
+   - New file: `tests/integration/quality.test.ts` (tests with mocked responses)
+   - New file: `tests/fixtures/responses/mqa-quality.json` (mock data)
+   - Modified: `src/server.ts` (register new tool)
+   - Modified: `README.md` (document new tool)
+   - Modified: `EXAMPLES.md` (add usage examples)
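The handler itself lands in `src/tools/quality.ts` (TypeScript), but the flow described above, resolving the dataset's `identifier` via CKAN `package_show` with a fallback to `name`, then querying the data.europa.eu MQA cache, can be sketched in Python. The endpoint is the one named in the implementation tasks further down; the function name and error handling are illustrative, not the package's actual code:

```python
import json
from urllib.request import urlopen

# Endpoint listed in task 1.4 of the implementation tasks
MQA_ENDPOINT = "https://data.europa.eu/api/mqa/cache/datasets/{id}"


def get_mqa_quality(server_url: str, dataset_id: str) -> dict:
    """Fetch MQA quality metrics for a dati.gov.it dataset (illustrative sketch)."""
    # Server validation (dati.gov.it only) omitted here; see the
    # Server Validation requirement in the spec below.

    # Step 1: resolve the MQA identifier via CKAN, falling back to `name`
    # when the identifier field is empty
    with urlopen(f"{server_url}/api/3/action/package_show?id={dataset_id}") as resp:
        pkg = json.load(resp)["result"]
    identifier = pkg.get("identifier") or pkg["name"]

    # Step 2: query the data.europa.eu MQA cache for that identifier
    with urlopen(MQA_ENDPOINT.format(id=identifier)) as resp:
        return json.load(resp)
```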
@@ -0,0 +1,71 @@
+ ## ADDED Requirements
+
+ ### Requirement: MQA Quality Score Retrieval
+ The system SHALL provide a tool to retrieve MQA (Metadata Quality Assurance) quality metrics from data.europa.eu for datasets published on dati.gov.it.
+
+ #### Scenario: Successful quality score retrieval
+ - **GIVEN** a valid dataset ID from dati.gov.it
+ - **WHEN** user requests quality metrics
+ - **THEN** system SHALL fetch identifier field from CKAN package_show
+ - **AND** system SHALL query data.europa.eu MQA API
+ - **AND** system SHALL return quality score and detailed metrics (accessibility, reusability, interoperability, findability)
+
+ #### Scenario: Identifier fallback to name
+ - **GIVEN** a dataset with empty identifier field
+ - **WHEN** user requests quality metrics
+ - **THEN** system SHALL use the name field as fallback identifier for MQA API query
+
+ #### Scenario: Dataset not found
+ - **GIVEN** an invalid or non-existent dataset ID
+ - **WHEN** user requests quality metrics
+ - **THEN** system SHALL return clear error message indicating dataset not found
+
+ #### Scenario: MQA API unavailable
+ - **GIVEN** data.europa.eu MQA API is unavailable or returns error
+ - **WHEN** user requests quality metrics
+ - **THEN** system SHALL return clear error message indicating MQA service unavailability
+
+ ### Requirement: Server Validation
+ The system SHALL restrict MQA quality queries to dati.gov.it server only.
+
+ #### Scenario: Valid dati.gov.it server
+ - **GIVEN** server_url is "https://www.dati.gov.it/opendata" or "https://dati.gov.it/opendata"
+ - **WHEN** user requests quality metrics
+ - **THEN** system SHALL proceed with MQA query
+
+ #### Scenario: Invalid server URL
+ - **GIVEN** server_url is not dati.gov.it (e.g., "https://catalog.data.gov")
+ - **WHEN** user requests quality metrics
+ - **THEN** system SHALL reject request with error message explaining MQA is only available for dati.gov.it
+
+ ### Requirement: Output Formats
+ The system SHALL support both markdown and JSON output formats for quality metrics.
+
+ #### Scenario: Markdown format (default)
+ - **GIVEN** user does not specify response_format or specifies "markdown"
+ - **WHEN** quality metrics are retrieved
+ - **THEN** system SHALL return human-readable markdown with:
+   - Overall quality score
+   - Breakdown by dimension (accessibility, reusability, interoperability, findability)
+   - Key findings and recommendations
+
+ #### Scenario: JSON format
+ - **GIVEN** user specifies response_format as "json"
+ - **WHEN** quality metrics are retrieved
+ - **THEN** system SHALL return complete MQA API response as structured JSON
+
+ ### Requirement: Tool Parameters
+ The system SHALL accept the following parameters for the MQA quality tool:
+ - server_url (required): Base URL of dati.gov.it portal
+ - dataset_id (required): Dataset ID or name
+ - response_format (optional): "markdown" (default) or "json"
+
+ #### Scenario: Minimal parameters
+ - **GIVEN** user provides only server_url and dataset_id
+ - **WHEN** tool is invoked
+ - **THEN** system SHALL use default markdown format
+
+ #### Scenario: All parameters specified
+ - **GIVEN** user provides server_url, dataset_id, and response_format
+ - **WHEN** tool is invoked
+ - **THEN** system SHALL use specified format for output
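The two accepted base URLs reduce the Server Validation requirement to a whitelist check. A minimal sketch, assuming trailing-slash normalization is sufficient; the constant and function names are illustrative, not the actual `src/tools/quality.ts` code:

```python
# The two base URLs accepted by the "Valid dati.gov.it server" scenario
VALID_SERVERS = {
    "https://www.dati.gov.it/opendata",
    "https://dati.gov.it/opendata",
}


def validate_server(server_url: str) -> None:
    """Reject any server other than dati.gov.it, per the requirement above."""
    if server_url.rstrip("/") not in VALID_SERVERS:
        raise ValueError(
            f"MQA quality metrics are only available for dati.gov.it (got: {server_url})"
        )
```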
@@ -0,0 +1,29 @@
+ # Implementation Tasks
+
+ ## 1. Core Implementation
+ - [x] 1.1 Create `src/tools/quality.ts` with `ckan_get_mqa_quality` tool handler
+ - [x] 1.2 Implement server URL validation (dati.gov.it only)
+ - [x] 1.3 Add CKAN package_show call to extract identifier field
+ - [x] 1.4 Add MQA API client (https://data.europa.eu/api/mqa/cache/datasets/{id})
+ - [x] 1.5 Implement markdown and JSON formatters for quality metrics
+ - [x] 1.6 Register tool in `src/server.ts`
+
+ ## 2. Testing
+ - [x] 2.1 Create mock fixtures for CKAN package_show response
+ - [x] 2.2 Create mock fixtures for MQA API response
+ - [x] 2.3 Write integration tests for successful quality retrieval
+ - [x] 2.4 Write tests for error scenarios (invalid server, dataset not found, MQA API unavailable)
+ - [x] 2.5 Write tests for fallback from identifier to name field
+ - [x] 2.6 Verify test coverage matches project standards
+
+ ## 3. Documentation
+ - [x] 3.1 Add tool description to README.md
+ - [x] 3.2 Add usage examples to EXAMPLES.md
+ - [x] 3.3 Document server restriction (dati.gov.it only)
+ - [x] 3.4 Document quality metrics structure (score, accessibility, reusability, interoperability, findability)
+
+ ## 4. Validation
+ - [x] 4.1 Run full test suite (npm test) - 212 tests passing
+ - [x] 4.2 Test manually with real dati.gov.it dataset
+ - [x] 4.3 Verify error handling for non-dati.gov.it servers
+ - [x] 4.4 Build project successfully (npm run build)
package/package.json CHANGED
@@ -1,6 +1,6 @@
    {
      "name": "@aborruso/ckan-mcp-server",
-     "version": "0.4.13",
+     "version": "0.4.15",
      "description": "MCP server for interacting with CKAN open data portals",
      "main": "dist/index.js",
      "type": "module",