PyPI - bits-bie - Versions diffs - 0.2.0__py3-none-any.whl - Mend

bits-bie 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

bie/__init__.py +60 -0
bie/agents/__init__.py +315 -0
bie/api/__init__.py +457 -0
bie/auth/__init__.py +255 -0
bie/chunker.py +83 -0
bie/cli.py +136 -0
bie/client.py +214 -0
bie/compliance/__init__.py +472 -0
bie/config.py +57 -0
bie/context/__init__.py +87 -0
bie/contradiction/__init__.py +204 -0
bie/crawler/__init__.py +325 -0
bie/crawler.py +109 -0
bie/engine.py +132 -0
bie/gateway/__init__.py +132 -0
bie/index.py +225 -0
bie/indexer/__init__.py +376 -0
bie/kg/__init__.py +394 -0
bie/mcp/__init__.py +3 -0
bie/mcp/server.py +101 -0
bie/models.py +76 -0
bie/quicksearch.py +37 -0
bie/regions/__init__.py +236 -0
bie/retriever/__init__.py +2 -0
bie/server.py +138 -0
bie/spiders/__init__.py +3 -0
bie/spiders/generic.py +117 -0
bie/trust/__init__.py +99 -0
bie/verifier/__init__.py +216 -0
bits_bie-0.2.0.dist-info/METADATA +281 -0
bits_bie-0.2.0.dist-info/RECORD +34 -0
bits_bie-0.2.0.dist-info/WHEEL +4 -0
bits_bie-0.2.0.dist-info/entry_points.txt +2 -0
bits_bie-0.2.0.dist-info/licenses/LICENSE +21 -0

bie/auth/__init__.py ADDED Viewed

@@ -0,0 +1,255 @@
+"""
+Enterprise Authentication & Authorization
+===========================================
+SSO (OAuth2/OIDC), JWT session tokens, API-key tenancy, and
+role-based access control (RBAC) for the v1.0 Enterprise tier.
+Components:
+  - ``APIKeyStore``     — per-tenant API keys with tier + quota
+  - ``JWTManager``      — issue/verify short-lived session JWTs (post-SSO)
+  - ``OIDCConfig``      — OIDC provider configuration (Okta/Azure AD/Google)
+  - ``RBAC``            — role → permission mapping
+  - ``require_role``    — FastAPI dependency factory
+This module has zero hard dependency on a real IdP — ``OIDCConfig``
+holds connection details, and ``verify_oidc_token`` validates tokens
+issued by any standards-compliant OIDC provider via JWKS.
+"""
+from __future__ import annotations
+import time
+import uuid
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Optional
+from jose import jwt, JWTError
+from pydantic import BaseModel, Field
+from bie.config import BIESettings, settings
+# ── Roles & permissions (RBAC) ─────────────────────────────────────────────────
+class Role(str, Enum):
+    """Access roles, ordered here from least to most privileged.
+    Members subclass ``str``, so each one compares equal to its plain string
+    value (for example ``Role.ADMIN == "admin"``).
+    """
+    VIEWER = "viewer"          # search only
+    DEVELOPER = "developer"    # search + agent + crawl
+    ADMIN = "admin"            # all + indices/update + webhooks
+    OWNER = "owner"            # all + billing + tenant management
+# Permission strings granted to each role. Every role's set is spelled out in
+# full rather than derived from the role below it, so a new permission has to
+# be added to each role that should receive it.
+_ROLE_PERMISSIONS: dict[Role, set[str]] = {
+    Role.VIEWER: {"search:read"},
+    Role.DEVELOPER: {"search:read", "agent:read", "crawl:write", "feedback:write"},
+    Role.ADMIN: {
+        "search:read", "agent:read", "crawl:write", "feedback:write",
+        "indices:write", "webhooks:write", "metrics:read", "kg:read",
+    },
+    # OWNER is the ADMIN set plus tenant and billing management.
+    Role.OWNER: {
+        "search:read", "agent:read", "crawl:write", "feedback:write",
+        "indices:write", "webhooks:write", "metrics:read", "kg:read",
+        "tenant:manage", "billing:manage",
+    },
+}
+class RBAC:
+    @staticmethod
+    def has_permission(role: Role, permission: str) -> bool:
+        return permission in _ROLE_PERMISSIONS.get(role, set())
+    @staticmethod
+    def permissions_for(role: Role) -> set[str]:
+        return _ROLE_PERMISSIONS.get(role, set())
+# ── Tenant / API key model ──────────────────────────────────────────────────────
+class PricingTier(str, Enum):
+    """Subscription tiers; each one maps to a monthly query quota."""
+    FREE = "free"
+    STARTUP = "startup"
+    BUSINESS = "business"
+    ENTERPRISE = "enterprise"
+# Monthly query allowance per tier. A negative value is the "no limit"
+# sentinel: the quota checks in this module only enforce quotas that are >= 0.
+_TIER_QUOTAS: dict[PricingTier, int] = {
+    PricingTier.FREE: 50_000,        # queries / month
+    PricingTier.STARTUP: 1_000_000,
+    PricingTier.BUSINESS: 10_000_000,
+    PricingTier.ENTERPRISE: -1,      # unlimited
+}
+@dataclass
+class Tenant:
+    """A customer account that API keys are issued against."""
+    # "tn_" followed by the first 12 hex characters of a random UUID4.
+    tenant_id: str = field(default_factory=lambda: f"tn_{uuid.uuid4().hex[:12]}")
+    name: str = ""
+    # Selects the monthly quota given to keys created for this tenant.
+    tier: PricingTier = PricingTier.FREE
+    region: str = "us-east-1"
+    # NOTE(review): the two SSO fields are stored but not read by any code
+    # visible in this module — confirm where they are consumed.
+    sso_enabled: bool = False
+    oidc_issuer: Optional[str] = None
+    # Creation time as a Unix timestamp (seconds) taken from time.time().
+    created_at: float = field(default_factory=time.time)
+@dataclass
+class APIKeyRecord:
+    """One API key together with its role, quota and usage counter."""
+    api_key: str
+    tenant_id: str
+    role: Role = Role.DEVELOPER
+    # Negative means unlimited; only quotas >= 0 are enforced by APIKeyStore.
+    monthly_quota: int = -1
+    # Requests counted since ``period_start``.
+    requests_this_month: int = 0
+    # Unix timestamp (seconds) marking the start of the current counting window.
+    period_start: float = field(default_factory=time.time)
+    # Inactive keys are rejected by APIKeyStore.validate.
+    active: bool = True
+class APIKeyStore:
+    """
+    In-memory multi-tenant API key store with quota tracking.
+    Swap for a Postgres/DynamoDB-backed implementation in production —
+    the interface (`validate`, `record_usage`, `create_key`) stays the same.
+    """
+    def __init__(self):
+        self._tenants: dict[str, Tenant] = {}
+        self._keys: dict[str, APIKeyRecord] = {}
+        self._seed_dev_key()
+    def _seed_dev_key(self) -> None:
+        tenant = Tenant(name="dev-tenant", tier=PricingTier.ENTERPRISE)
+        self._tenants[tenant.tenant_id] = tenant
+        self._keys["dev-key-12345"] = APIKeyRecord(
+            api_key="dev-key-12345",
+            tenant_id=tenant.tenant_id,
+            role=Role.OWNER,
+            monthly_quota=-1,
+        )
+    def create_tenant(self, name: str, tier: PricingTier, region: str = "us-east-1") -> Tenant:
+        tenant = Tenant(name=name, tier=tier, region=region)
+        self._tenants[tenant.tenant_id] = tenant
+        return tenant
+    def create_key(self, tenant_id: str, role: Role = Role.DEVELOPER) -> APIKeyRecord:
+        tenant = self._tenants.get(tenant_id)
+        if tenant is None:
+            raise ValueError(f"Unknown tenant {tenant_id}")
+        quota = _TIER_QUOTAS[tenant.tier]
+        key = APIKeyRecord(
+            api_key=f"bie_{uuid.uuid4().hex}",
+            tenant_id=tenant_id,
+            role=role,
+            monthly_quota=quota,
+        )
+        self._keys[key.api_key] = key
+        return key
+    def validate(self, api_key: str) -> tuple[APIKeyRecord, Tenant] | None:
+        record = self._keys.get(api_key)
+        if record is None or not record.active:
+            return None
+        tenant = self._tenants.get(record.tenant_id)
+        if tenant is None:
+            return None
+        self._maybe_reset_period(record)
+        return record, tenant
+    def record_usage(self, api_key: str) -> bool:
+        """Returns False if quota exceeded."""
+        record = self._keys.get(api_key)
+        if record is None:
+            return False
+        self._maybe_reset_period(record)
+        if record.monthly_quota >= 0 and record.requests_this_month >= record.monthly_quota:
+            return False
+        record.requests_this_month += 1
+        return True
+    def _maybe_reset_period(self, record: APIKeyRecord) -> None:
+        elapsed = time.time() - record.period_start
+        if elapsed > 30 * 86400:  # 30-day rolling period
+            record.requests_this_month = 0
+            record.period_start = time.time()
+    def quota_status(self, api_key: str) -> dict:
+        record = self._keys.get(api_key)
+        if record is None:
+            return {}
+        return {
+            "quota": record.monthly_quota,
+            "used": record.requests_this_month,
+            "remaining": (
+                "unlimited" if record.monthly_quota < 0
+                else max(0, record.monthly_quota - record.requests_this_month)
+            ),
+        }
+# ── JWT session management (post-SSO) ───────────────────────────────────────────
+class JWTManager:
+    """
+    Issues and verifies short-lived JWTs after a successful SSO/OIDC
+    login. Used for browser-based dashboard sessions; API traffic uses
+    API keys (`APIKeyStore`).
+    """
+    def __init__(self, cfg: BIESettings = settings, ttl_seconds: int = 3600):
+        self._secret = cfg.secret_key
+        self._ttl = ttl_seconds
+        self._algorithm = "HS256"
+    def issue(self, subject: str, tenant_id: str, role: Role) -> str:
+        now = int(time.time())
+        payload = {
+            "sub": subject,
+            "tenant_id": tenant_id,
+            "role": role.value,
+            "iat": now,
+            "exp": now + self._ttl,
+        }
+        return jwt.encode(payload, self._secret, algorithm=self._algorithm)
+    def verify(self, token: str) -> dict | None:
+        try:
+            return jwt.decode(token, self._secret, algorithms=[self._algorithm])
+        except JWTError:
+            return None
+# ── OIDC / SSO configuration ─────────────────────────────────────────────────────
+class OIDCConfig(BaseModel):
+    """
+    Connection details for an enterprise SSO provider
+    (Okta, Azure AD, Google Workspace, OneLogin, etc).
+    Tokens issued by the provider are validated via JWKS against
+    `jwks_uri` — no provider-specific code required.
+    """
+    # Expected ``iss`` claim; passed to the decoder as ``issuer``.
+    issuer: str
+    # NOTE(review): not read by the token verification visible in this module.
+    client_id: str
+    # Where the caller fetches the provider's signing keys from.
+    jwks_uri: str
+    # Expected ``aud`` claim; passed to the decoder as ``audience``.
+    audience: str
+    # Signature algorithms the decoder will accept; defaults to RS256 only.
+    algorithms: list[str] = Field(default_factory=lambda: ["RS256"])
+async def verify_oidc_token(token: str, oidc: OIDCConfig, jwks_keys: list[dict]) -> dict | None:
+    """
+    Verifies an OIDC ID token against the provider's JWKS.
+    `jwks_keys` should be fetched from `oidc.jwks_uri` and cached
+    by the caller (e.g. refreshed hourly).
+    """
+    try:
+        header = jwt.get_unverified_header(token)
+        kid = header.get("kid")
+        key = next((k for k in jwks_keys if k.get("kid") == kid), None)
+        if key is None:
+            return None
+        claims = jwt.decode(
+            token, key, algorithms=oidc.algorithms,
+            audience=oidc.audience, issuer=oidc.issuer,
+        )
+        return claims
+    except JWTError:
+        return None

bie/chunker.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""
+Lightweight text chunker — splits cleaned document text into
+paragraph/section-sized chunks for indexing (PRD Module 8: Context Builder).
+No heavy NLP deps; sentence/paragraph aware, with overlap.
+"""
+from __future__ import annotations
+import re
+from bie.models import Chunk, Document
+# Paragraph boundary: a newline, optional whitespace, then one or more newlines.
+_PARA_SPLIT = re.compile(r"\n\s*\n+")
+# Sentence boundary: whitespace that directly follows '.', '!' or '?'. The
+# lookbehind keeps the punctuation attached to the preceding sentence.
+_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+")
+def chunk_document(doc: Document, chunk_size: int = 800, overlap: int = 100) -> list[Chunk]:
+    """Split a document's text into overlapping chunks.
+    Strategy:
+      1. Split on paragraph boundaries.
+      2. Greedily pack paragraphs into chunks up to ``chunk_size`` chars.
+      3. If a single paragraph exceeds ``chunk_size``, split it by sentence.
+      4. Apply a small character-overlap between consecutive chunks so
+         retrieval doesn't lose context at boundaries.
+    Args:
+        doc: Source document; only ``doc.text`` is split here.
+        chunk_size: Target maximum chunk length in characters.
+        overlap: Number of trailing characters of a finished chunk that are
+            repeated at the start of the next one; 0 disables the overlap.
+    Returns:
+        The chunks in document order, or an empty list when the text is
+        empty or whitespace-only.
+    """
+    text = (doc.text or "").strip()
+    if not text:
+        return []
+    paragraphs = [p.strip() for p in _PARA_SPLIT.split(text) if p.strip()]
+    if not paragraphs:
+        paragraphs = [text]
+    # Phase 1: reduce the text to "units" — whole paragraphs that fit, or
+    # runs of sentences packed up to chunk_size for paragraphs that do not.
+    units: list[str] = []
+    for para in paragraphs:
+        if len(para) <= chunk_size:
+            units.append(para)
+        else:
+            sentences = _SENT_SPLIT.split(para)
+            buf = ""
+            for sent in sentences:
+                # +1 accounts for the joining space.
+                if len(buf) + len(sent) + 1 <= chunk_size:
+                    buf = f"{buf} {sent}".strip()
+                else:
+                    if buf:
+                        units.append(buf)
+                    # A single sentence longer than chunk_size is kept whole,
+                    # so a unit can still exceed chunk_size.
+                    buf = sent
+            if buf:
+                units.append(buf)
+    # Phase 2: pack units into chunks, joining them with a blank line.
+    chunks: list[Chunk] = []
+    buf = ""
+    offset = 0
+    for unit in units:
+        candidate = f"{buf}\n\n{unit}".strip() if buf else unit
+        if len(candidate) <= chunk_size:
+            buf = candidate
+            continue
+        if buf:
+            chunks.append(_make_chunk(doc, buf, offset))
+            # The running offset is derived from emitted chunk lengths minus
+            # the overlap, not from positions looked up in ``text``.
+            # NOTE(review): offsets therefore look approximate relative to
+            # the original document — confirm what consumers expect.
+            offset += max(len(buf) - overlap, 0)
+            tail = buf[-overlap:] if overlap else ""
+            # The carried-over tail is prepended without re-checking the
+            # length, so the next buffer may start out above chunk_size.
+            buf = f"{tail}\n\n{unit}".strip() if tail else unit
+        else:
+            # Nothing buffered and the unit alone is too large: take it as is.
+            buf = unit
+    if buf:
+        chunks.append(_make_chunk(doc, buf, offset))
+    return chunks
+def _make_chunk(doc: Document, text: str, start_offset: int) -> Chunk:
+    return Chunk(
+        doc_id=doc.doc_id,
+        text=text,
+        start_offset=start_offset,
+        end_offset=start_offset + len(text),
+        metadata={"site": doc.site, "title": doc.title},
+    )

bie/cli.py ADDED Viewed

@@ -0,0 +1,136 @@
+"""
+BIE command-line interface.
+Examples::
+    bie search "AI regulation 2026" --url https://example.com/news
+    bie crawl https://example.com --max-pages 20 --out docs.jsonl
+    bie serve --port 8000
+    bie mcp
+"""
+from __future__ import annotations
+import json
+import sys
+import click
+from bie import __version__
+from bie.config import BIESettings
+from bie.engine import BIE
+# Root command group; the subcommands below attach themselves to it with
+# @cli.command(). The docstring doubles as the text shown by ``--help``.
+@click.group()
+@click.version_option(__version__, prog_name="bie")
+def cli() -> None:
+    """BIE — BitSearch Intelligence Engine. Real-time web search & extraction for AI apps."""
+@cli.command()
+@click.argument("query")
+@click.option("--url", "urls", multiple=True, required=True, help="Seed URL(s) to crawl & search")
+@click.option("--top-k", default=10, show_default=True, help="Number of results to return")
+@click.option("--max-pages", default=10, show_default=True, help="Max pages to crawl per seed URL")
+@click.option("--no-embeddings", is_flag=True, help="Disable semantic/vector search (BM25 only)")
+@click.option("--json", "as_json", is_flag=True, help="Output raw JSON")
+def search(query: str, urls: tuple[str, ...], top_k: int, max_pages: int, no_embeddings: bool, as_json: bool) -> None:
+    """Crawl URL(s) and search the freshly indexed content for QUERY."""
+    settings = BIESettings(max_pages=max_pages, use_embeddings=not no_embeddings)
+    engine = BIE(settings)
+    click.echo(f"Crawling {len(urls)} source(s)...", err=True)
+    n = engine.crawl(list(urls))
+    click.echo(f"Indexed {n} document(s). Searching...", err=True)
+    response = engine.search_full(query, top_k=top_k)
+    if as_json:
+        click.echo(response.model_dump_json(indent=2))
+        return
+    if not response.results:
+        click.echo("No results found.")
+        return
+    for i, r in enumerate(response.results, 1):
+        click.echo(f"\n{i}. {r.title}")
+        click.echo(f"   {r.url}")
+        click.echo(f"   score={r.score:.4f}  trust={r.trust_score:.2f}")
+        click.echo(f"   {r.snippet}")
+    click.echo(f"\n({response.took_ms} ms, {response.total_indexed_documents} docs indexed)")
+@cli.command()
+@click.argument("urls", nargs=-1, required=True)
+@click.option("--max-pages", default=40, show_default=True)
+@click.option("--max-depth", default=2, show_default=True)
+@click.option("--out", "output", default=None, help="Write extracted documents as JSONL to this path")
+def crawl(urls: tuple[str, ...], max_pages: int, max_depth: int, output: str | None) -> None:
+    """Crawl URLS using the Bitscrape-powered spider and print/save extracted docs."""
+    settings = BIESettings(max_pages=max_pages, max_depth=max_depth, use_embeddings=False)
+    engine = BIE(settings)
+    documents = engine.crawler.crawl(list(urls))
+    if output:
+        with open(output, "w", encoding="utf-8") as f:
+            for doc in documents:
+                f.write(doc.model_dump_json() + "\n")
+        click.echo(f"Wrote {len(documents)} document(s) to {output}")
+    else:
+        for doc in documents:
+            click.echo(json.dumps({"url": doc.url, "title": doc.title, "chars": len(doc.text)}))
+        click.echo(f"\n{len(documents)} document(s) crawled.", err=True)
+@cli.command()
+@click.option("--host", default=None, help="Bind host (default from settings / 0.0.0.0)")
+@click.option("--port", default=None, type=int, help="Bind port (default from settings / 8000)")
+@click.option("--reload", is_flag=True, help="Auto-reload on code changes (dev only)")
+def serve(host: str | None, port: int | None, reload: bool) -> None:
+    """Run the BIE REST API server (FastAPI + Uvicorn)."""
+    try:
+        import uvicorn
+    except ImportError:
+        click.echo("uvicorn is required: pip install 'bits-bie[server]'", err=True)
+        sys.exit(1)
+    settings = BIESettings()
+    uvicorn.run(
+        "bie.server:app",
+        host=host or settings.host,
+        port=port or settings.port,
+        reload=reload,
+    )
+@cli.command()
+def mcp() -> None:
+    """Run BIE as a Model Context Protocol (MCP) server over stdio.
+    Add to your MCP client config (e.g. Claude Desktop) as a command:
+    \b
+        {
+          "mcpServers": {
+            "bie": {
+              "command": "bie",
+              "args": ["mcp"]
+            }
+          }
+        }
+    """
+    # The server module is imported on demand so the rest of the CLI still
+    # loads without the optional MCP extra; a failed import is reported as an
+    # install hint and the process exits with status 1.
+    try:
+        from bie.mcp.server import run_mcp_server
+    except ImportError:
+        click.echo("MCP support requires: pip install 'bits-bie[mcp]'", err=True)
+        sys.exit(1)
+    run_mcp_server()
+def main() -> None:
+    """Invoke the click command group."""
+    cli()
+# Allow running the module directly as a script.
+if __name__ == "__main__":
+    main()

bie/client.py ADDED Viewed

@@ -0,0 +1,214 @@
+"""
+BIE Python SDK — High-level client
+====================================
+Use this in your own AI applications to search BIE programmatically.
+Example::
+    import asyncio
+    from bie import BIEClient
+    async def main():
+        async with BIEClient(base_url="http://localhost:8000", api_key="my-key") as client:
+            # Simple hybrid search
+            resp = await client.search("latest AI research 2026", top_k=5)
+            for r in resp.results:
+                print(r.rank, r.title, r.url, r.trust_score)
+            # RAG: grounded LLM answer with citations
+            answer = await client.agent_query("What happened in TSMC Q2 2026?")
+            print(answer.answer)
+            for c in answer.citations:
+                print(f"  [{c.index}] {c.url}")
+            # On-demand crawl
+            await client.crawl_url("https://example.com/new-article")
+    asyncio.run(main())
+Sync wrapper::
+    from bie.client import BIEClientSync
+    client = BIEClientSync(base_url="http://localhost:8000", api_key="my-key")
+    resp = client.search("semiconductor supply chain")
+"""
+from __future__ import annotations
+import asyncio
+from typing import AsyncIterator
+import httpx
+from bie.models import (
+    AgentResponse,
+    CrawlRequest,
+    CrawlResponse,
+    HealthResponse,
+    SearchFilters,
+    SearchRequest,
+    SearchResponse,
+)
+class BIEClient:
+    """
+    Async HTTP client for the BIE REST API.
+    Use as an async context manager or call `.close()` manually.
+    """
+    def __init__(
+        self,
+        base_url: str = "http://localhost:8000",
+        api_key: str = "dev-key",
+        timeout: float = 30.0,
+    ):
+        self._base_url = base_url.rstrip("/")
+        self._api_key = api_key
+        self._client = httpx.AsyncClient(
+            base_url=self._base_url,
+            headers={"X-API-Key": api_key},
+            timeout=timeout,
+        )
+    async def __aenter__(self) -> "BIEClient":
+        return self
+    async def __aexit__(self, *_) -> None:
+        await self.close()
+    async def close(self) -> None:
+        await self._client.aclose()
+    # ── Search ────────────────────────────────────────────────────────────────
+    async def search(
+        self,
+        query: str,
+        top_k: int = 10,
+        filters: SearchFilters | None = None,
+        use_reranker: bool = True,
+    ) -> SearchResponse:
+        """Hybrid BM25 + vector search. Returns ranked results."""
+        payload = SearchRequest(
+            query=query,
+            top_k=top_k,
+            filters=filters or SearchFilters(),
+            use_reranker=use_reranker,
+        )
+        resp = await self._client.post("/search", content=payload.model_dump_json())
+        resp.raise_for_status()
+        return SearchResponse.model_validate(resp.json())
+    async def search_stream(
+        self, query: str, top_k: int = 10
+    ) -> AsyncIterator[str]:
+        """Stream search results as SSE events."""
+        async with self._client.stream(
+            "GET", "/search/stream", params={"query": query, "top_k": top_k}
+        ) as resp:
+            resp.raise_for_status()
+            async for line in resp.aiter_lines():
+                if line.startswith("data: "):
+                    yield line[6:]
+    # ── Agent / RAG ───────────────────────────────────────────────────────────
+    async def agent_query(
+        self,
+        query: str,
+        top_k: int = 10,
+        filters: SearchFilters | None = None,
+    ) -> AgentResponse:
+        """Full RAG: retrieve → build context → LLM answer with citations."""
+        payload = SearchRequest(
+            query=query,
+            top_k=top_k,
+            filters=filters or SearchFilters(),
+        )
+        resp = await self._client.post("/agent/query", content=payload.model_dump_json())
+        resp.raise_for_status()
+        return AgentResponse.model_validate(resp.json())
+    async def agent_stream(
+        self, query: str, top_k: int = 10
+    ) -> AsyncIterator[str]:
+        """Stream LLM tokens via SSE."""
+        async with self._client.stream(
+            "GET", "/agent/stream", params={"query": query, "top_k": top_k}
+        ) as resp:
+            resp.raise_for_status()
+            async for line in resp.aiter_lines():
+                if line.startswith("data: "):
+                    data = line[6:]
+                    if data != "[DONE]":
+                        yield data
+    # ── Crawler ───────────────────────────────────────────────────────────────
+    async def crawl_url(self, url: str, priority: int = 5) -> CrawlResponse:
+        """Trigger an on-demand crawl + index of a single URL."""
+        payload = CrawlRequest(url=url, priority=priority)
+        resp = await self._client.post("/crawl/url", content=payload.model_dump_json())
+        resp.raise_for_status()
+        return CrawlResponse.model_validate(resp.json())
+    async def crawl_batch(self, urls: list[str]) -> dict:
+        """Batch crawl up to 50 URLs."""
+        resp = await self._client.post("/crawl/batch", json=urls)
+        resp.raise_for_status()
+        return resp.json()
+    # ── Feedback ──────────────────────────────────────────────────────────────
+    async def feedback(self, url: str, positive: bool) -> None:
+        """Send thumbs-up / thumbs-down to improve trust scoring."""
+        resp = await self._client.post(
+            "/feedback", params={"url": url, "positive": str(positive).lower()}
+        )
+        resp.raise_for_status()
+    # ── Ops ───────────────────────────────────────────────────────────────────
+    async def health(self) -> HealthResponse:
+        resp = await self._client.get("/health")
+        resp.raise_for_status()
+        return HealthResponse.model_validate(resp.json())
+    async def metrics(self) -> dict:
+        resp = await self._client.get("/metrics")
+        resp.raise_for_status()
+        return resp.json()
+# ── Sync wrapper ───────────────────────────────────────────────────────────────
+class BIEClientSync:
+    """
+    Synchronous wrapper around BIEClient.
+    Useful in scripts, Jupyter notebooks, or non-async frameworks.
+    """
+    def __init__(self, **kwargs):
+        self._async_client = BIEClient(**kwargs)
+        self._loop = asyncio.new_event_loop()
+    def _run(self, coro):
+        return self._loop.run_until_complete(coro)
+    def search(self, query: str, **kwargs) -> SearchResponse:
+        return self._run(self._async_client.search(query, **kwargs))
+    def agent_query(self, query: str, **kwargs) -> AgentResponse:
+        return self._run(self._async_client.agent_query(query, **kwargs))
+    def crawl_url(self, url: str, **kwargs) -> CrawlResponse:
+        return self._run(self._async_client.crawl_url(url, **kwargs))
+    def health(self) -> HealthResponse:
+        return self._run(self._async_client.health())
+    def close(self):
+        self._run(self._async_client.close())
+        self._loop.close()