@aborruso/ckan-mcp-server 0.4.13 → 0.4.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,277 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Basic LangGraph Workflow with CKAN MCP Server
4
+
5
+ Demonstrates simple sequential workflow:
6
+ 1. Search datasets by keyword
7
+ 2. Filter by metadata quality using scoring system
8
+ 3. Extract CSV resources
9
+ 4. Display results
10
+
11
+ Run:
12
+ uvx --with langgraph --with mcp --with langchain-core python 01_basic_workflow.py
13
+ """
14
+
15
+ import asyncio
16
+ import json
17
+ import os
18
+ from typing import Annotated, TypedDict
19
+
20
+ from langgraph.graph import StateGraph, START, END
21
+ from langgraph.graph.message import add_messages
22
+ from mcp import ClientSession, StdioServerParameters
23
+ from mcp.client.stdio import stdio_client
24
+
25
+ from metadata_quality import MetadataQualityScorer
26
+
27
# --- Configuration ---
CKAN_SERVER = "https://www.dati.gov.it/opendata"  # target CKAN portal
MCP_SERVER_PATH = os.path.join(os.path.dirname(__file__), "../../dist/index.js")
# Minimum metadata quality score (0-100) a dataset must reach to be kept.
QUALITY_THRESHOLD = 40
# Limit rows to avoid JSON truncation in MCP responses.
SEARCH_ROWS = 5
32
+
33
+
34
+ # State definition
35
class WorkflowState(TypedDict):
    """Shared state threaded through every node of the workflow."""

    # Conversation log; annotated with the add_messages reducer for LangGraph.
    messages: Annotated[list, add_messages]
    # Search term supplied by the caller.
    query: str
    # Raw search hits from CKAN.
    datasets: list[dict]
    # Hits that passed the quality threshold.
    filtered_datasets: list[dict]
    # CSV resources extracted from the filtered hits.
    csv_resources: list[dict]
    # First failure message, or None while everything is healthy.
    error: str | None
44
+
45
+
46
+ # MCP Client helper
47
class CKANMCPClient:
    """Thin wrapper around an MCP session for CKAN tool calls."""

    def __init__(self, session: "ClientSession"):
        self.session = session

    async def search_packages(self, query: str, rows: int = SEARCH_ROWS) -> dict:
        """Run ``ckan_package_search`` and decode its JSON payload.

        Returns the parsed response dict, or ``{"error": ...}`` when the
        payload is missing or is not valid JSON.
        """
        result = await self.session.call_tool(
            "ckan_package_search",
            arguments={
                "server_url": CKAN_SERVER,
                "q": query,
                "rows": rows,
                "response_format": "json",
            },
        )

        # The tool answers with a list of content parts; the JSON body is
        # carried in the first text part.
        text_parts = [c.text for c in result.content if c.type == "text"]
        if not text_parts:
            return {"error": "No content in response"}

        payload = text_parts[0]
        # Cut at the truncation marker (if any) so the prefix stays parseable.
        if "[Response truncated" in payload:
            payload = payload.split("[Response truncated")[0].strip()
        try:
            return json.loads(payload)
        except json.JSONDecodeError as e:
            return {"error": f"JSON parse error: {e}"}
78
+
79
+
80
+ # Workflow nodes
81
async def search_datasets_node(
    state: "WorkflowState", mcp_client: "CKANMCPClient"
) -> "WorkflowState":
    """Workflow node 1: run the CKAN search and stash hits in the state.

    On success fills ``state["datasets"]`` and logs a message; on failure
    records the problem in ``state["error"]``.
    """
    print(f"\n[1/3] Searching datasets for: '{state['query']}'")

    try:
        response = await mcp_client.search_packages(state["query"])

        if "error" in response:
            state["error"] = response["error"]
            print(f" ✗ Error: {response['error']}")
        elif "results" in response:
            hits = response["results"]
            state["datasets"] = hits
            state["messages"].append(
                {"role": "assistant", "content": f"Found {len(hits)} datasets"}
            )
            print(
                f" ✓ Found {response.get('count', len(hits))} total, showing {len(hits)}"
            )
        else:
            state["error"] = "Unexpected response structure"
            print(" ✗ Error: missing 'results' key")
    except Exception as e:  # transport / protocol failures
        state["error"] = str(e)
        print(f" ✗ Error: {e}")

    return state
113
+
114
+
115
async def filter_quality_node(state: "WorkflowState") -> "WorkflowState":
    """Workflow node 2: keep only datasets whose metadata quality score
    meets ``QUALITY_THRESHOLD``.

    Each dataset gets its score attached under the ``_quality`` key so later
    nodes (and the final report) can show why it was kept or rejected.

    Fix: the original body contained a second, older filtering pass after
    the ``return state`` statement; that code was unreachable and has been
    removed.
    """
    print("\n[2/3] Filtering by metadata quality")

    # Propagate upstream failures without doing any work.
    if state.get("error"):
        return state

    scorer = MetadataQualityScorer()
    filtered = []

    for ds in state["datasets"]:
        quality = scorer.score_dataset(ds)
        ds["_quality"] = quality  # Attach quality info to dataset

        if quality["score"] >= QUALITY_THRESHOLD:
            filtered.append(ds)
            print(
                f" ✓ {ds['title'][:50]}: {quality['score']}/100 ({quality['level']})"
            )
        else:
            print(f" ✗ {ds['title'][:50]}: {quality['score']}/100 (rejected)")

    state["filtered_datasets"] = filtered
    state["messages"].append(
        {
            "role": "assistant",
            "content": f"Filtered to {len(filtered)} quality datasets",
        }
    )
    print(
        f"\n → {len(filtered)}/{len(state['datasets'])} datasets pass quality threshold ({QUALITY_THRESHOLD})"
    )

    return state
167
+
168
+
169
async def extract_csv_node(state: "WorkflowState") -> "WorkflowState":
    """Workflow node 3: collect CSV resources from the quality-filtered datasets."""
    print("\n[3/3] Extracting CSV resources")

    if state.get("error"):
        return state

    found = [
        {
            "dataset_name": ds["name"],
            "dataset_title": ds["title"],
            "resource_name": res.get("name", "Untitled"),
            "url": res.get("url"),
        }
        for ds in state["filtered_datasets"][:5]  # Limit to first 5
        for res in ds.get("resources", [])
        if res.get("format", "").lower() == "csv"
    ]

    state["csv_resources"] = found
    state["messages"].append(
        {
            "role": "assistant",
            "content": f"Extracted {len(found)} CSV resources",
        }
    )
    print(f" ✓ Found {len(found)} CSV resources")

    return state
199
+
200
+
201
+ # Build graph
202
async def build_workflow(mcp_client: "CKANMCPClient") -> "StateGraph":
    """Assemble the linear search → filter → extract graph and compile it."""
    graph = StateGraph(WorkflowState)

    # The search node needs the MCP client, so bind it via a closure.
    async def run_search(state: "WorkflowState") -> "WorkflowState":
        return await search_datasets_node(state, mcp_client)

    graph.add_node("search", run_search)
    graph.add_node("filter", filter_quality_node)
    graph.add_node("extract", extract_csv_node)

    # Strictly sequential pipeline.
    graph.add_edge(START, "search")
    graph.add_edge("search", "filter")
    graph.add_edge("filter", "extract")
    graph.add_edge("extract", END)

    return graph.compile()
221
+
222
+
223
async def main():
    """Connect to the MCP server, run the workflow, and print a summary."""
    banner = "=" * 60
    print(banner)
    print("LangGraph + CKAN MCP Server - Basic Workflow")
    print(banner)

    # Launch the Node.js MCP server as a stdio subprocess.
    server_params = StdioServerParameters(command="node", args=[MCP_SERVER_PATH])

    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            print("\n✓ Connected to CKAN MCP Server")

            workflow = await build_workflow(CKANMCPClient(session))

            initial_state: "WorkflowState" = {
                "messages": [],
                "query": "mobilità urbana",
                "datasets": [],
                "filtered_datasets": [],
                "csv_resources": [],
                "error": None,
            }

            result = await workflow.ainvoke(initial_state)

            print("\n" + banner)
            print("RESULTS")
            print(banner)

            if result["error"]:
                print(f"\n✗ Workflow failed: {result['error']}")
            else:
                print(f"\nQuery: {result['query']}")
                print(f"Total datasets found: {len(result['datasets'])}")
                print(f"Quality datasets: {len(result['filtered_datasets'])}")
                print(f"CSV resources: {len(result['csv_resources'])}")

                if result["csv_resources"]:
                    print("\nFirst 3 CSV resources:")
                    for i, res in enumerate(result["csv_resources"][:3], 1):
                        print(f"\n{i}. {res['resource_name']}")
                        print(f" Dataset: {res['dataset_title']}")
                        print(f" URL: {res['url']}")

            print("\n" + banner)


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,366 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Data Exploration Workflow with State and Conditionals
4
+
5
+ Advanced workflow demonstrating:
6
+ - Conditional branching (DataStore vs CSV)
7
+ - State persistence across decisions
8
+ - Human-in-the-loop for resource selection
9
+ - SQL queries on DataStore resources
10
+
11
+ Run:
12
+ python 02_data_exploration.py
13
+ """
14
+
15
import asyncio
import json
import os
from typing import Annotated, Literal, TypedDict

from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
24
+
25
+
26
# --- Configuration ---
CKAN_SERVER = "https://www.dati.gov.it/opendata"
MCP_SERVER_PATH = os.path.join(os.path.dirname(__file__), "../../dist/index.js")
SEARCH_ROWS = 5  # Markdown format handles truncation gracefully
# NOTE: Some queries return very large metadata. Use specific queries like
# "trasporti" instead of generic ones like "CSV" or "popolazione" to avoid
# JSON truncation.
32
+
33
+
34
+ # State definition
35
class ExplorationState(TypedDict):
    """State for the data exploration workflow.

    Fix: this was declared as a plain ``dict`` subclass carrying only class
    annotations. ``TypedDict`` matches the sibling basic-workflow script and
    is the conventional schema type to pass to ``StateGraph``. Instances are
    still created as ordinary dict literals, so callers are unaffected.
    """

    # Conversation log; annotated with the add_messages reducer for LangGraph.
    messages: Annotated[list, add_messages]
    # Search term for the CKAN portal.
    query: str
    # Raw search hits.
    datasets: list[dict]
    # Dataset chosen in the human-in-the-loop step.
    selected_dataset: dict | None
    # Resource chosen from the selected dataset.
    selected_resource: dict | None
    # Detected resource kind that drives the conditional branch.
    resource_type: Literal["datastore", "csv", "unknown"] | None
    # Output of whichever analysis branch ran.
    analysis_result: dict | None
    # First failure message, or None.
    error: str | None
46
+
47
+
48
+ # MCP Client
49
class CKANMCPClient:
    """Helper for CKAN MCP operations.

    Fix: the JSON-decoding of tool results was duplicated verbatim in
    ``search_packages`` and ``datastore_search``; it is now shared via
    ``_parse_json_content``.
    """

    def __init__(self, session: "ClientSession"):
        self.session = session

    @staticmethod
    def _parse_json_content(result) -> dict:
        """Decode the JSON payload of an MCP tool result.

        Scans the content parts for the first text part, strips a trailing
        truncation marker if present, and parses it. Returns the parsed
        dict, or ``{"error": ...}`` when the payload is missing or malformed.
        """
        for content in result.content:
            if content.type == "text":
                try:
                    text = content.text
                    # Cut at the marker so the remaining prefix stays parseable.
                    if "[Response truncated" in text:
                        text = text.split("[Response truncated")[0].strip()
                    return json.loads(text)
                except json.JSONDecodeError as e:
                    return {"error": f"JSON parse error: {e}"}
        return {"error": "No content in response"}

    async def search_packages(self, query: str, rows: int = SEARCH_ROWS) -> dict:
        """Search packages via ``ckan_package_search``."""
        result = await self.session.call_tool(
            "ckan_package_search",
            arguments={
                "server_url": CKAN_SERVER,
                "q": query,
                "rows": rows,
                "response_format": "json",
            },
        )
        return self._parse_json_content(result)

    async def datastore_search(self, resource_id: str, limit: int = 3) -> dict:
        """Fetch a small sample of DataStore records for *resource_id*."""
        result = await self.session.call_tool(
            "ckan_datastore_search",
            arguments={
                "server_url": CKAN_SERVER,
                "resource_id": resource_id,
                "limit": limit,
                "response_format": "json",
            },
        )
        return self._parse_json_content(result)
98
+
99
+
100
+ # Workflow nodes
101
async def search_node(
    state: "ExplorationState", mcp_client: "CKANMCPClient"
) -> "ExplorationState":
    """Search the CKAN portal and stash the hits in ``state["datasets"]``."""
    print(f"\n[SEARCH] Query: '{state['query']}'")

    try:
        response = await mcp_client.search_packages(state["query"])

        if "error" in response:
            state["error"] = response["error"]
            print(f" ✗ Error: {response['error']}")
        elif "results" in response:
            hits = response["results"]
            state["datasets"] = hits
            print(
                f" ✓ Found {response.get('count', len(hits))} total, showing {len(hits)}"
            )
        else:
            state["error"] = "Unexpected response structure"
    except Exception as e:  # transport / protocol failures
        state["error"] = str(e)
        print(f" ✗ Error: {e}")

    return state
129
+
130
+
131
async def select_dataset_node(state: "ExplorationState") -> "ExplorationState":
    """Human-in-the-loop step: pick a dataset (auto-selects the first)."""
    print("\n[SELECT DATASET] Available datasets:")

    # Nothing to choose from on error or an empty result set.
    if state.get("error") or not state.get("datasets"):
        return state

    # Preview the top 3 candidates.
    for idx, ds in enumerate(state["datasets"][:3], 1):
        print(f"\n{idx}. {ds['title']}")
        print(f" Resources: {ds.get('num_resources', 0)}")
        print(f" Org: {ds.get('organization', {}).get('title', 'N/A')}")

    # Simulated choice; a real app would prompt with input().
    chosen = state["datasets"][0]
    state["selected_dataset"] = chosen
    print(f"\n → Selected: {chosen['title']}")

    return state
150
+
151
+
152
async def select_resource_node(state: "ExplorationState") -> "ExplorationState":
    """Pick the first resource of the selected dataset and classify its type."""
    print("\n[SELECT RESOURCE]")

    if state.get("error") or not state.get("selected_dataset"):
        return state

    resources = state["selected_dataset"].get("resources", [])
    if not resources:
        state["error"] = "No resources available"
        return state

    print("Available resources:")
    for idx, res in enumerate(resources[:3], 1):
        print(f"{idx}. {res.get('name', 'Untitled')} ({res.get('format', 'N/A')})")

    chosen = resources[0]
    state["selected_resource"] = chosen

    # Classify: an active DataStore beats a raw CSV; anything else is unknown.
    if chosen.get("datastore_active"):
        state["resource_type"] = "datastore"
        print("\n → Type: DataStore (SQL queries available)")
    elif chosen.get("format", "").lower() == "csv":
        state["resource_type"] = "csv"
        print("\n → Type: CSV (download required)")
    else:
        state["resource_type"] = "unknown"
        print("\n → Type: Unknown format")

    return state
184
+
185
+
186
async def analyze_datastore_node(
    state: "ExplorationState", mcp_client: "CKANMCPClient"
) -> "ExplorationState":
    """Sample a few DataStore records and summarize fields into the state."""
    print("\n[ANALYZE DATASTORE]")

    if state.get("error"):
        return state

    try:
        resource_id = state["selected_resource"]["id"]
        result = await mcp_client.datastore_search(resource_id, limit=3)

        if "error" in result:
            state["error"] = result["error"]
            print(f" ✗ Error: {result['error']}")
        elif "records" in result:
            records = result["records"]
            # Field metadata may be missing or heterogeneous; keep dict entries.
            field_ids = [
                f["id"] for f in result.get("fields", []) if isinstance(f, dict)
            ]

            state["analysis_result"] = {
                "type": "datastore",
                "record_count": len(records),
                "fields": field_ids,
                "sample_records": records,
            }

            print(f" ✓ Fields: {', '.join(field_ids[:5])}")
            print(f" ✓ Sample: {len(records)} records")
        else:
            state["error"] = "DataStore query failed"
    except Exception as e:
        state["error"] = str(e)
        print(f" ✗ Error: {e}")

    return state
225
+
226
+
227
async def analyze_csv_node(state: "ExplorationState") -> "ExplorationState":
    """Record CSV download details; the real analysis is left as a stub."""
    print("\n[ANALYZE CSV]")

    if state.get("error"):
        return state

    resource = state["selected_resource"]
    # In a real app: download and analyze with pandas/duckdb.
    state["analysis_result"] = {
        "type": "csv",
        "url": resource.get("url"),
        "format": resource.get("format"),
    }

    print(f" → URL: {state['analysis_result']['url']}")
    print(" (Download and analyze with DuckDB/pandas)")

    return state
245
+
246
+
247
async def skip_analysis_node(state: "ExplorationState") -> "ExplorationState":
    """Fallback branch: mark the analysis as skipped for unknown formats."""
    print("\n[SKIP ANALYSIS] Unknown format, cannot analyze")
    state["analysis_result"] = {"type": "unknown", "skipped": True}
    return state
252
+
253
+
254
+ # Routing function
255
def route_by_resource_type(state: "ExplorationState") -> str:
    """Return the name of the next node based on the detected resource type.

    Errors short-circuit to ``"end"``; unrecognized or missing types fall
    through to the skip branch.
    """
    if state.get("error"):
        return "end"

    routes = {
        "datastore": "analyze_datastore",
        "csv": "analyze_csv",
    }
    return routes.get(state.get("resource_type"), "skip_analysis")
267
+
268
+
269
+ # Build workflow
270
async def build_workflow(mcp_client: "CKANMCPClient") -> "StateGraph":
    """Build the exploration graph with a conditional analysis branch."""
    graph = StateGraph(ExplorationState)

    # Nodes that talk to the MCP server get the client bound via closures.
    async def run_search(state: "ExplorationState") -> "ExplorationState":
        return await search_node(state, mcp_client)

    async def run_datastore_analysis(state: "ExplorationState") -> "ExplorationState":
        return await analyze_datastore_node(state, mcp_client)

    graph.add_node("search", run_search)
    graph.add_node("select_dataset", select_dataset_node)
    graph.add_node("select_resource", select_resource_node)
    graph.add_node("analyze_datastore", run_datastore_analysis)
    graph.add_node("analyze_csv", analyze_csv_node)
    graph.add_node("skip_analysis", skip_analysis_node)

    # Linear prefix: search → select dataset → select resource.
    graph.add_edge(START, "search")
    graph.add_edge("search", "select_dataset")
    graph.add_edge("select_dataset", "select_resource")

    # Branch on the detected resource type.
    graph.add_conditional_edges(
        "select_resource",
        route_by_resource_type,
        {
            "analyze_datastore": "analyze_datastore",
            "analyze_csv": "analyze_csv",
            "skip_analysis": "skip_analysis",
            "end": END,
        },
    )

    # Every analysis branch terminates the graph.
    graph.add_edge("analyze_datastore", END)
    graph.add_edge("analyze_csv", END)
    graph.add_edge("skip_analysis", END)

    return graph.compile()
311
+
312
+
313
async def main():
    """Connect to the MCP server, run the exploration workflow, and report."""
    banner = "=" * 60
    print(banner)
    print("LangGraph + CKAN MCP - Data Exploration Workflow")
    print(banner)

    # Launch the Node.js MCP server as a stdio subprocess.
    server_params = StdioServerParameters(command="node", args=[MCP_SERVER_PATH])

    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            print("\n✓ Connected to CKAN MCP Server")

            workflow = await build_workflow(CKANMCPClient(session))

            initial_state: "ExplorationState" = {
                "messages": [],
                "query": "trasporti",  # Query that returns manageable datasets
                "datasets": [],
                "selected_dataset": None,
                "selected_resource": None,
                "resource_type": None,
                "analysis_result": None,
                "error": None,
            }

            result = await workflow.ainvoke(initial_state)

            print("\n" + banner)
            print("WORKFLOW RESULT")
            print(banner)

            if result.get("error"):
                print(f"\n✗ Error: {result['error']}")
            elif result.get("analysis_result"):
                analysis = result["analysis_result"]
                print(f"\nAnalysis Type: {analysis['type']}")

                if analysis["type"] == "datastore":
                    print(f"Fields: {', '.join(analysis['fields'][:5])}")
                    print(f"Records sampled: {analysis['record_count']}")
                elif analysis["type"] == "csv":
                    print(f"URL: {analysis['url']}")
                else:
                    print("Skipped (unknown format)")

            print("\n" + banner)


if __name__ == "__main__":
    asyncio.run(main())