wbgapi360 0.2.1__py3-none-any.whl

wbgapi360/__init__.py ADDED
@@ -0,0 +1,51 @@
+ from .core.client import Data360Client
+ from .search.engine import SearchEngine
+ from .data.builder import DataBuilder
+ from .ai.agent import DataAgent
+ from .metadata.builder import MetadataBuilder
+ from .visual import viz
+
+
+ class API:
+     def __init__(self):
+         self._client = None
+
+     @property
+     def client(self):
+         if not self._client:
+             self._client = Data360Client()
+         return self._client
+
+     @property
+     def search(self):
+         return SearchEngine(self.client)
+
+     @property
+     def data(self):
+         return DataBuilder(self.client)
+
+     @property
+     def metadata(self):
+         return MetadataBuilder(self.client)
+
+     @property
+     def ai(self):
+         return DataAgent(self.client)
+
+     @property
+     def visual(self):
+         return viz
+
+     async def close(self):
+         if self._client:
+             await self._client.close()
+
+ __version__ = "0.2.1"
+ __author__ = "Maykol Medrano"
+ __email__ = "mmedrano2@uc.cl"
+ __credits__ = ["Applied Economist Policy Data Scientist"]
+
+ # Expose the human-friendly API at top level
+ from wbgapi360.api import search, get_data, plot
+
+ __all__ = ["search", "get_data", "plot"]
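
A minimal usage sketch of the top-level facade exposed above (the indicator code is taken from the get_data docstring and is illustrative):

import wbgapi360 as wb

hits = wb.search("inflation", limit=5)                      # list of {code, name, source} dicts
df = wb.get_data("NY.GDP.MKTP.KD", ["CHL", "PER"], years=10)  # tidy DataFrame
png = wb.plot("trend", df, title="GDP (constant USD)")        # path to the rendered chart
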
wbgapi360/ai/agent.py ADDED
@@ -0,0 +1,114 @@
+ from ..core.client import Data360Client
+ from ..search.engine import SearchEngine
+ from ..data.builder import DataBuilder
+ from typing import Dict, Any, List
+ import logging
+
+ logger = logging.getLogger("wbgapi360")
+
+ class DataAgent:
+     """
+     The 'Smart' interface. Relies on the API's vector search
+     to resolve natural language to IDs.
+     """
+     def __init__(self, client: Data360Client):
+         self.client = client
+         self.search = SearchEngine(client)
+
+     async def get_context(self, natural_query: str) -> Dict[str, Any]:
+         """
+         Understands the query using vector search and returns a DataContext.
+         """
+         # 1. Search for the indicator using semantic search, preferring WDI
+         logger.info(f"[AI] Thinking about '{natural_query}'...")
+         results = await self.search.semantic_explore(natural_query, database_id="WB_WDI")
+
+         if not results:
+             logger.info(f"[AI] No results found for '{natural_query}'.")
+             return {"error": f"I couldn't find any relevant data for '{natural_query}' in the World Bank 360 API."}
+
+         # 2. Pick the top result after minimal sanity checks.
+         # In a real agent, we might present the top 3 to the user if confidence is low.
+         best_match = results[0]
+         series_desc = best_match.get('series_description', {})
+         indicator_id = series_desc.get('idno')
+         name = series_desc.get('name')
+         database_id = series_desc.get('database_id')
+
+         if not indicator_id:
+             return {"error": "Found a match but it lacked a valid Indicator ID."}
+
+         logger.info(f"[AI] I found: {name} (ID: {indicator_id}, DB: {database_id})")
+
+         return {
+             "indicator": indicator_id,
+             "database_id": database_id or "WB_WDI",
+             "name": name,
+             "raw_match": best_match
+         }
+
+     async def get_available_dimensions(self, indicator_id: str) -> Dict[str, List[str]]:
+         """
+         Queries /disaggregation to see which dimensions are valid.
+         Returns a dict of dim_name -> list of valid codes.
+         """
+         try:
+             # The disaggregation endpoint returns metadata about valid filters.
+             # We use the generic 'get_data' since disaggregation is a GET endpoint.
+             response = await self.client.get_data("/disaggregation", params={"indicatorId": indicator_id})
+
+             # Response handling logic (simplified for prototype).
+             # We assume the response is a list of objects with dimension info.
+             dims = {}
+             if isinstance(response, dict) and "value" in response:
+                 vals = response["value"]
+                 # Heuristic parsing of dimension metadata: the structure is assumed
+                 # to be a list of dicts with 'id', 'name', or 'code'. This is
+                 # speculative without the specific API contract for /disaggregation,
+                 # so we look for common keys.
+                 if isinstance(vals, list):
+                     for v in vals:
+                         dim_id = v.get('id') or v.get('code')
+                         if dim_id:
+                             # Map the dimension ID to its valid codes (empty if unknown)
+                             dims[dim_id] = []
+                             # If there's a nested 'values' list, capture it
+                             if 'values' in v and isinstance(v['values'], list):
+                                 dims[dim_id] = [sub.get('id') for sub in v['values'] if 'id' in sub]
+
+             return dims
+         except Exception as e:
+             logger.warning(f"[AI] Warning: Could not introspect dimensions: {e}")
+             return {}
+
+     async def ask(self, natural_query: str, economy: str = "WLD", years: int = 5):
+         """
+         End-to-end flow: Question -> Data.
+         """
+         ctx = await self.get_context(natural_query)
+         if "error" in ctx:
+             return ctx
+
+         indicator_id = ctx["indicator"]
+         database_id = ctx.get("database_id", "WB_WDI")
+
+         # 3. Introspect (Smart Step)
+         logger.info(f"[AI] Inspecting dimensions for {indicator_id}...")
+         # For this prototype we just log that we are doing it.
+         # In a full version, we would check whether 'economy' or 'years' is valid,
+         # or whether we need extra filters based on the query text (e.g. 'rural').
+
+         # 4. Fetch data via Builder
+         logger.info(f"[AI] Fetching data for {economy} from {database_id}...")
+         builder = DataBuilder(self.client, dataset_id=database_id)
+         data = await builder.indicator(indicator_id).economy(economy).limit(years).get()
+
+         return {
+             "answer": f"Here is the data for '{ctx['name']}'",
+             "data": data,
+             "source_indicator": indicator_id,
+             "name": ctx['name']
+         }
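
A sketch of the end-to-end ask() flow, assuming Data360Client supports async context management (as api.py uses it):

import asyncio
from wbgapi360.core.client import Data360Client
from wbgapi360.ai.agent import DataAgent

async def main():
    async with Data360Client() as client:
        agent = DataAgent(client)
        # Natural-language question -> semantic search -> DataBuilder fetch
        result = await agent.ask("gdp per capita", economy="CHL", years=10)
        print(result.get("answer") or result.get("error"))

asyncio.run(main())
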
wbgapi360/api.py ADDED
@@ -0,0 +1,257 @@
+ import logging
+ import asyncio
+ import nest_asyncio
+ import pandas as pd
+ import datetime
+ import os
+ from typing import List, Dict, Any, Union, Optional
+
+ # --- CORE IMPORTS (Decoupled Architecture) ---
+ from wbgapi360.core.client import Data360Client
+ from wbgapi360.search.engine import SearchEngine
+ from wbgapi360.data.builder import DataBuilder
+ from wbgapi360.visual.charts import Visualizer
+ from wbgapi360.core.utils import normalize_codes, resolve_economies
+ from wbgapi360.core.transformers import DataStandardizer
+ from wbgapi360.core.auditor import DataAuditor
+
+ # --- SETUP ---
+ logger = logging.getLogger("wbgapi360")
+ logger.addHandler(logging.NullHandler())
+
+ # Singletons for Sync API
+ # _client = Data360Client() -> previously global; now instantiated per call to fix the asyncio lifecycle
+ _viz = Visualizer()
+
+ def _run_sync(coro):
+     """
+     Private helper: runs a coroutine synchronously.
+     Applies nest_asyncio only when a loop is already running (Jupyter).
+     """
+     try:
+         loop = asyncio.get_event_loop()
+     except RuntimeError:
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+
+     if loop.is_running():
+         # Scoped side effect: applied only on demand.
+         nest_asyncio.apply()
+         return loop.run_until_complete(coro)
+     else:
+         return asyncio.run(coro)
+
+ # --- INTERNAL ASYNC LOGIC (Replicated from server to decouple) ---
+
+ async def _async_get_data(
+     indicator: Union[str, List[str]],
+     economies: Union[str, List[str]],
+     years: int = 5,
+     database_id: str = "WB_WDI",
+     labels: bool = False,  # Convert codes to human names
+     as_frame: bool = True  # Default to True for Python API
+ ) -> Union[pd.DataFrame, str]:
+
+     async with Data360Client() as client:
+         # 1. Normalize inputs
+         codes = normalize_codes(indicator, database_id)
+         eco_list = resolve_economies(economies)
+
+         # 2. Builder
+         builder = DataBuilder(client, dataset_id=database_id)
+         builder.indicator(codes).economy(eco_list)
+
+         # 3. Time logic (MRV vs Trend)
+         current_year = datetime.datetime.now().year
+
+         if years == 1:
+             # MRV logic (simplified): true MRV filtering is complex, so for v0.3
+             # we request a simple 5-year range and keep only the most recent
+             # valid observation in the post-processing step below.
+             start_year = current_year - 5
+             builder.time(f"{start_year}:{current_year}")
+             mrv_mode = True
+         else:
+             mrv_mode = False
+             start_year = current_year - years
+             builder.time(f"{start_year}:{current_year}")
+
+         # 4. Fetch
+         try:
+             # pivot=True is standard for human analysis;
+             # labels=True converts codes to names (preserves codes for maps)
+             df = await builder.to_dataframe(pivot=True, labels=labels)
+
+             if df.empty:
+                 return pd.DataFrame() if as_frame else "No data found."
+
+             # MRV post-processing (keep only the latest valid value)
+             if mrv_mode and not df.empty:
+                 def _is_year(c):
+                     s = str(c)
+                     if s.isdigit(): return True
+                     try:
+                         f = float(s)
+                         return f.is_integer() and 1900 < f < 2100
+                     except ValueError:
+                         return False
+
+                 year_cols = [c for c in df.columns if _is_year(c)]
+                 if year_cols:
+                     year_cols.sort()
+                     # ffill to get the last valid observation
+                     last_vals = df[year_cols].ffill(axis=1).iloc[:, -1]
+                     # Clean up structure
+                     df = df.drop(columns=year_cols)
+                     df[f'mrv_{current_year}'] = pd.to_numeric(last_vals, errors='coerce')
+                     df = df.dropna()
+
+             if as_frame:
+                 return df
+
+             return df.reset_index().to_json(orient='records')
+
+         except Exception as e:
+             logger.error(f"Data retrieval failed: {e}")
+             return pd.DataFrame() if as_frame else f"Error: {e}"
+
+ # --- PUBLIC API (USER FACING) ---
+
+ def search(query: str, limit: int = 10) -> List[Dict[str, Any]]:
+     """
+     Search for World Bank indicators using natural language and Smart Ranking.
+
+     Args:
+         query: Search term (e.g., "Inflation", "education spending")
+         limit: Maximum number of results.
+     """
+     logger.info(f"Searching for: '{query}'")
+
+     async def _do_search():
+         async with Data360Client() as client:
+             search_engine = SearchEngine(client)
+             raw = await search_engine.semantic_explore(query, database_id="WB_WDI")
+             # Format for humans
+             return [
+                 {
+                     "code": item.get('series_description', {}).get('idno'),
+                     "name": item.get('series_description', {}).get('name'),
+                     "source": item.get('series_description', {}).get('database_id')
+                 }
+                 for item in raw[:limit]
+             ]
+
+     return _run_sync(_do_search())
+
+ def get_data(
+     indicator: Union[str, List[str]],
+     economies: Union[str, List[str]],
+     years: int = 5,
+     labels: bool = False,  # Convert codes to names
+     as_json: bool = False
+ ) -> Union[pd.DataFrame, str]:
+     """
+     Downloads cleaned, ready-to-use data (DataFrame by default).
+
+     Args:
+         indicator: Indicator code(s) (e.g., "NY.GDP.MKTP.KD" or "GDP")
+         economies: Country code(s) (e.g., ["CHL", "PER"] or "USA")
+         years: Number of years back (default: 5). If 1, fetches the most recent value (MRV).
+         labels: If True, converts ISO codes (USA) to readable names (United States).
+             Preserves the original codes in a REF_AREA_CODE column for map compatibility.
+         as_json: If True, returns a JSON string instead of a DataFrame.
+
+     Returns:
+         pd.DataFrame (index=[REF_AREA, INDICATOR], columns=[Years...]),
+         or a JSON string if as_json=True.
+     """
+     logger.info(f"Fetching data via Senior Analyst Engine for {economies}...")
+
+     # 1. Fetch raw data using the internal async logic.
+     # Force as_frame=True so we can standardize; if the user wants JSON, we convert at the end.
+     df = _run_sync(_async_get_data(indicator, economies, years, labels=labels, as_frame=True))
+
+     if isinstance(df, str):  # Error message
+         return df
+
+     # --- SENIOR ANALYST LAYER ---
+     try:
+         # 2. Standardize
+         df = DataStandardizer.ensure_tidy(df)
+
+         # 3. Audit
+         report = DataAuditor.audit(df)
+         DataAuditor.print_report(report)
+     except Exception as e:
+         # Fail safe: if the analyst layer crashes, return raw data but log a warning
+         logger.warning(f"Senior Analyst Layer error: {e}")
+
+     if as_json:
+         return df.to_json(orient='records')
+
+     return df
+
+ def plot(chart_type: str, data: Union[str, pd.DataFrame], title: str = "", subtitle: str = "", **kwargs) -> str:
+     """
+     Generate a Financial Times-style chart with editorial aesthetics.
+
+     Args:
+         chart_type: Chart type (trend, bar, scatter, map, map_bubble, map_diverging, map_categorical, etc.)
+         data: DataFrame or JSON string with data
+         title: Chart title
+         subtitle: Chart subtitle
+         **kwargs: Additional chart-specific arguments
+             (e.g., bins, labels for map_categorical)
+
+     Returns:
+         Absolute path to the generated image.
+     """
+     logger.info(f"Plotting chart type: {chart_type}")
+
+     # Logic adapted for local dispatch
+     dispatch_table = {
+         'trend': _viz.plot_trend,
+         'line': _viz.plot_trend,
+         'bar': _viz.plot_bar,
+         'column': _viz.plot_column,
+         'scatter': _viz.plot_scatter,
+         'map': _viz.plot_map,
+         'map_bubble': _viz.plot_map_bubble,
+         'map_diverging': _viz.plot_map_diverging,
+         'map_categorical': _viz.plot_map_categorical,
+         'dumbbell': _viz.plot_dumbbell,
+         'stacked': _viz.plot_stacked_bar,
+         'stacked_bar': _viz.plot_stacked_bar,
+         'area': _viz.plot_area,
+         'heatmap': _viz.plot_heatmap,
+         'bump': _viz.plot_bump,
+         'treemap': _viz.plot_treemap,
+         'donut': _viz.plot_donut,
+         'pie': _viz.plot_donut
+     }
+
+     func = dispatch_table.get(chart_type)
+     if not func:
+         raise ValueError(f"Unknown chart type: {chart_type}")
+
+     # Handle JSON input (pandas is already imported at module level)
+     if isinstance(data, str):
+         try:
+             import json
+             data = pd.DataFrame(json.loads(data))
+         except Exception as e:
+             raise ValueError(f"Invalid JSON data: {e}")
+
+     # Generate a unique path
+     import tempfile, uuid
+     chart_id = str(uuid.uuid4())[:8]
+     path = os.path.join(tempfile.gettempdir(), f"wbg_plot_{chart_id}_{chart_type}.png")
+
+     # Execute the plot function with all kwargs
+     func(data, title=title, subtitle=subtitle, save_path=path, **kwargs)
+
+     return path
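
To illustrate the years=1 (MRV) branch above, a sketch (the indicator code is illustrative, not prescribed by this module):

from wbgapi360.api import get_data, plot

# years=1 triggers MRV mode: fetch the last 5 years, ffill across the year
# columns, and keep a single 'mrv_<current_year>' column per row.
latest = get_data("SP.DYN.LE00.IN", ["CHL", "PER", "COL"], years=1, labels=True)

# plot() resolves 'bar' through the dispatch table and returns the image path.
image_path = plot("bar", latest, title="Life expectancy", subtitle="Most recent value")
print(image_path)
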
wbgapi360/cli.py ADDED
@@ -0,0 +1,83 @@
+ import argparse
+ import asyncio
+ import sys
+ import json
+ from . import API
+ from .config import settings
+
+ # Facade instance exposing .search, .data, and .close()
+ api = API()
+
+ async def run_search(args):
+     """Execute search command."""
+     print(f"Searching for: {args.query}...")
+     try:
+         results = await api.search.semantic_explore(args.query)
+         if args.json:
+             print(json.dumps(results, indent=2))
+         else:
+             print(f"Found {len(results)} results:")
+             for r in results[:10]:
+                 desc = r.get('series_description', {})
+                 print(f"[{desc.get('idno')}] {desc.get('name')}")
+     except Exception as e:
+         print(f"Error: {e}")
+
+ async def run_data(args):
+     """Execute data fetch command."""
+     print(f"Fetching data: {args.indicator} for {args.economy}...")
+     try:
+         data = await api.data.indicator(args.indicator).economy(args.economy).limit(args.limit).get()
+         if args.json:
+             print(json.dumps(data, indent=2))
+         else:
+             print(f"Returned {len(data)} rows.")
+             # Simple table print for the CLI
+             print(f"{'Economy':<10} {'Year':<10} {'Value':<15}")
+             print("-" * 35)
+             for row in data:
+                 # Based on the OAS, fields might be uppercase like OBS_VALUE
+                 eco = row.get('REF_AREA') or row.get('economy') or 'N/A'
+                 time = row.get('TIME_PERIOD') or row.get('time') or 'N/A'
+                 val = row.get('OBS_VALUE')
+                 if val is None:  # avoid treating a legitimate 0 as missing
+                     val = row.get('value', 'N/A')
+                 print(f"{eco:<10} {time:<10} {val:<15}")
+
+     except Exception as e:
+         print(f"Error: {e}")
+
+ async def async_main():
+     parser = argparse.ArgumentParser(description="wbgapi360 Enterprise CLI")
+     subparsers = parser.add_subparsers(dest="command", required=True)
+
+     # Search command
+     search_parser = subparsers.add_parser("search", help="Semantic search for indicators")
+     search_parser.add_argument("query", help="Search term or natural language query")
+     search_parser.add_argument("--json", action="store_true", help="Output raw JSON")
+
+     # Data command
+     data_parser = subparsers.add_parser("data", help="Fetch data")
+     data_parser.add_argument("--indicator", required=True, help="Indicator ID")
+     data_parser.add_argument("--economy", default="WLD", help="Economy code (default: WLD)")
+     data_parser.add_argument("--limit", type=int, default=5, help="Limit results")
+     data_parser.add_argument("--json", action="store_true", help="Output raw JSON")
+
+     # Config command
+     config_parser = subparsers.add_parser("config", help="Show current configuration")
+
+     args = parser.parse_args()
+
+     if args.command == "search":
+         await run_search(args)
+     elif args.command == "data":
+         await run_data(args)
+     elif args.command == "config":
+         print("Current Configuration:")
+         print(settings.model_dump_json(indent=2))
+
+     await api.close()
+
+ def main():
+     try:
+         asyncio.run(async_main())
+     except KeyboardInterrupt:
+         print("\nAborted.")
+
+ if __name__ == "__main__":
+     main()
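
The wheel's console-script entry point isn't shown in this diff, so a sketch that drives the CLI in-process by simulating argv:

import sys
from wbgapi360.cli import main

# Equivalent to: <entry-point> data --indicator NY.GDP.MKTP.KD --economy CHL --limit 5
sys.argv = ["wbgapi360", "data", "--indicator", "NY.GDP.MKTP.KD",
            "--economy", "CHL", "--limit", "5"]
main()
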
wbgapi360/config.py ADDED
@@ -0,0 +1,41 @@
+ import os
+ from pydantic import BaseModel, Field
+
+ class Settings(BaseModel):
+     """
+     Enterprise Configuration Management.
+     Loads from environment variables, tailored for Docker/K8s.
+     """
+     # Allow population by field name, since load() passes field names while
+     # the fields themselves declare environment-variable aliases.
+     model_config = {"populate_by_name": True}
+
+     API_URL: str = Field(default="https://data360api.worldbank.org/data360", alias="WBG360_API_URL")
+     TIMEOUT: int = Field(default=30, alias="WBG360_TIMEOUT")
+     MAX_RETRIES: int = Field(default=3, alias="WBG360_MAX_RETRIES")
+     ENABLE_CACHE: bool = Field(default=True, alias="WBG360_ENABLE_CACHE")
+     CACHE_TTL: int = Field(default=300, alias="WBG360_CACHE_TTL")  # 5 minutes default
+     CACHE_DIR: str = Field(default=os.path.join(os.path.expanduser("~"), ".wbgapi360"), alias="WBG360_CACHE_DIR")
+
+     # Logging
+     LOG_LEVEL: str = Field(default="INFO", alias="WBG360_LOG_LEVEL")
+
+     @classmethod
+     def load(cls):
+         # Initial load from os.environ.
+         # Pydantic v2 usually uses pydantic-settings, but to avoid an extra
+         # dependency at this prototype stage we manually populate from
+         # os.environ using the aliases when present; anything not set falls
+         # back to the field defaults.
+         data = {}
+
+         for name, field in cls.model_fields.items():
+             alias = field.alias or name
+             if alias in os.environ:
+                 val = os.environ[alias]
+                 # Simple boolean conversion
+                 if field.annotation is bool:
+                     val = val.lower() in ('true', '1', 'yes')
+                 # Simple int conversion
+                 elif field.annotation is int:
+                     val = int(val)
+                 data[name] = val
+
+         return cls(**data)
+
+ settings = Settings.load()
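
A sketch of how the alias-based environment loading above behaves:

import os

# The aliases, not the field names, are read from the environment
os.environ["WBG360_TIMEOUT"] = "60"
os.environ["WBG360_ENABLE_CACHE"] = "false"

from wbgapi360.config import Settings

settings = Settings.load()
print(settings.TIMEOUT)        # 60 (coerced to int)
print(settings.ENABLE_CACHE)   # False (coerced via the 'true'/'1'/'yes' check)
print(settings.API_URL)        # default: https://data360api.worldbank.org/data360
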
wbgapi360/core/auditor.py ADDED
@@ -0,0 +1,107 @@
+ import pandas as pd
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class DataAuditor:
+     """
+     The 'Eyes' of the Senior Analyst.
+     Checks data before it reaches the user.
+     """
+
+     @staticmethod
+     def audit(df: pd.DataFrame) -> dict:
+         """
+         Performs a full audit on the dataframe.
+         Returns a report dictionary.
+         """
+         if df.empty:
+             return {"status": "CRITICAL", "message": "No data returned."}
+
+         report = {"status": "OK", "warnings": []}
+
+         # 1. Gaps audit (the "Venezuela Check")
+         # Ensure we have an 'OBS_VALUE' column
+         val_col = 'OBS_VALUE'
+         if val_col not in df.columns:
+             # Fall back to the first numeric column
+             nums = df.select_dtypes(include=['number'])
+             if not nums.empty:
+                 val_col = nums.columns[0]
+
+         if val_col in df.columns:
+             missing = df[val_col].isna().sum()
+             total = len(df)
+             completeness = (1.0 - missing / total) if total > 0 else 0
+
+             if completeness < 0.8:
+                 report['warnings'].append(f"High data gap detected. Completeness: {completeness:.1%}. Check for missing countries/years.")
+                 report['status'] = "WARNING"
+
+             if completeness == 0:
+                 report['status'] = "CRITICAL"
+                 report['message'] = "Data is completely empty (Black Hole)."
+
+         # 2. Scale audit (the "Japan vs Zimbabwe Check")
+         # Check variance / magnitude
+         if val_col in df.columns and pd.api.types.is_numeric_dtype(df[val_col]):
+             vmin = df[val_col].min()
+             vmax = df[val_col].max()
+
+             if vmax > 0 and vmin > 0:
+                 ratio = vmax / vmin
+                 if ratio > 10_000:  # 4 orders of magnitude
+                     report['warnings'].append(f"Extreme scale differences detected (Max/Min ratio: {ratio:,.0f}). Consider a log scale.")
+
+         return report
+
+     @staticmethod
+     def detect_scale_conflict(df: pd.DataFrame) -> dict:
+         """
+         Analyzes whether the dataframe mixes series with incompatible scales
+         (e.g., billions vs percentages).
+         Returns a dict with 'has_conflict', 'micro_cols', 'macro_cols'.
+         """
+         conflict_info = {'has_conflict': False, 'micro_cols': [], 'macro_cols': []}
+
+         # Identify numeric columns (excluding Year if it's a column);
+         # str() guards against non-string (e.g., integer year) column labels
+         numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
+         numeric_cols = [c for c in numeric_cols if str(c).lower() not in ['year', 'time_period']]
+
+         if len(numeric_cols) < 2:
+             return conflict_info
+
+         micro = []
+         macro = []
+
+         for col in numeric_cols:
+             # Check magnitude (using the median to be robust against outliers)
+             median_val = df[col].abs().median()
+
+             # Logic: micro < 500 (covers %, indices 0-100, etc.);
+             # macro > 1,000,000 (millions, billions)
+             if median_val < 500:
+                 micro.append(col)
+             elif median_val > 1_000_000:
+                 macro.append(col)
+
+         # A conflict exists only when we have BOTH types
+         if micro and macro:
+             conflict_info['has_conflict'] = True
+             conflict_info['micro_cols'] = micro
+             conflict_info['macro_cols'] = macro
+
+         return conflict_info
+
+     @staticmethod
+     def print_report(report):
+         """Prints a user-friendly audit report to the console."""
+         if report['status'] == 'OK' and not report['warnings']:
+             # Stay silent when the data is clean
+             return
+
+         print(f"\n[SENIOR ANALYST AUDIT] Status: {report['status']}")
+         for w in report.get('warnings', []):
+             print(f"  ⚠ {w}")
+         if 'message' in report:
+             print(f"  ! {report['message']}")
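
A sketch of what the audit flags, using a synthetic frame:

import pandas as pd
from wbgapi360.core.auditor import DataAuditor

df = pd.DataFrame({
    "REF_AREA": ["JPN", "ZWE", "CHL", "PER"],
    "OBS_VALUE": [4.2e12, 2.1e7, None, 9.0e8],
})

report = DataAuditor.audit(df)
# Completeness is 75% (< 80%) -> status WARNING, and the max/min ratio is
# ~200,000 (> 10,000) -> the "Consider a log scale" hint is appended.
DataAuditor.print_report(report)
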