bits-bie 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bie/__init__.py ADDED
@@ -0,0 +1,85 @@
1
+ """
2
+ BIE — BitSearch Intelligence Engine
3
+ =====================================
4
+
5
+ A real-time web search engine and crawling toolkit for AI applications —
6
+ built on top of **Bitscrape** (https://pypi.org/project/bitscrape/). No
7
+ API keys, no subscriptions, no third-party search services.
8
+
9
+ Core primitives
10
+ ----------------
11
+
12
+ - :func:`websearch` — search the live internet for a query (no URLs needed)
13
+ - :func:`search` — crawl + rank specific URLs against a query
14
+ - :func:`extract` — get clean Markdown from a single URL
15
+ - :func:`map_site` — discover a site's sitemap before crawling
16
+ - :func:`crawl_site` — crawl a site guided by a natural-language instruction
17
+ - :class:`BIE` — build a persistent, queryable index
18
+
19
+ Quick start
20
+ -----------
21
+
22
+ .. code-block:: python
23
+
24
+ import bie
25
+
26
+ # Search the live internet — no URLs, no API key, no subscription
27
+ results = bie.websearch("who won the latest F1 race")
28
+ for r in results:
29
+ print(r.title, r.url)
30
+ print(r.snippet)
31
+
32
+ # Get clean markdown from a specific page
33
+ page = bie.extract("https://example.com/article")
34
+ print(page.markdown)
35
+
36
+ # Discover a site's structure before crawling
37
+ sitemap = bie.map_site("https://example.com")
38
+ print(sitemap.urls[:10])
39
+
40
+ # Crawl a site guided by an instruction
41
+ engine, results = bie.crawl_site(
42
+ ["https://docs.example.com"],
43
+ instruction="authentication and rate limits",
44
+ )
45
+
46
+ Run as a server::
47
+
48
+ bie serve --port 8000
49
+
50
+ Run as an MCP tool (for Claude Desktop, Claude Code, etc.)::
51
+
52
+ bie mcp
53
+ """
54
+
55
+ from __future__ import annotations
56
+
57
+ from bie.config import BIESettings
58
+ from bie.engine import BIE
59
+ from bie.extract import ExtractError, ExtractResult, extract
60
+ from bie.models import Document, SearchResult
61
+ from bie.quicksearch import search, websearch
62
+ from bie.security import SecurityFinding, SecurityReport, scan_for_prompt_injection
63
+ from bie.sitecrawl import crawl_site
64
+ from bie.sitemap import SiteMap, map_site
65
+
66
# Resolve the version from the installed distribution's metadata so that
# ``bie.__version__`` (and therefore ``bie --version``) always matches the
# release that is actually installed, instead of a hand-maintained literal
# that can drift from the published package version.
from importlib.metadata import PackageNotFoundError as _PackageNotFoundError
from importlib.metadata import version as _distribution_version

try:
    __version__ = _distribution_version("bits-bie")
except _PackageNotFoundError:
    # No installed distribution metadata (e.g. imported straight from a
    # source checkout); keep ``__version__`` a plain string for callers.
    __version__ = "0+unknown"

# Public API of the ``bie`` package, grouped by the module each name comes from.
__all__ = [
    # engine / configuration
    "BIE",
    "BIESettings",
    # data models
    "Document",
    "SearchResult",
    # one-shot search helpers
    "search",
    "websearch",
    # single-page extraction
    "extract",
    "ExtractResult",
    "ExtractError",
    # sitemap discovery
    "map_site",
    "SiteMap",
    # instruction-guided crawling
    "crawl_site",
    # prompt-injection scanning
    "scan_for_prompt_injection",
    "SecurityReport",
    "SecurityFinding",
    "__version__",
]
bie/chunker.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ Lightweight text chunker — splits cleaned document text into
3
+ paragraph/section-sized chunks for indexing (PRD Module 8: Context Builder).
4
+
5
+ No heavy NLP deps; sentence/paragraph aware, with overlap.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+
12
+ from bie.models import Chunk, Document
13
+
14
+ _PARA_SPLIT = re.compile(r"\n\s*\n+")
15
+ _SENT_SPLIT = re.compile(r"(?<=[.!?])\s+")
16
+
17
+
18
def chunk_document(doc: Document, chunk_size: int = 800, overlap: int = 100) -> list[Chunk]:
    """Split a document's text into overlapping chunks.

    Strategy:
    1. Split on paragraph boundaries (blank lines).
    2. If a single paragraph exceeds ``chunk_size``, split it by sentence and
       greedily re-pack the sentences into units of at most ``chunk_size``
       characters. A single sentence that is itself longer than
       ``chunk_size`` (e.g. text without sentence punctuation) is cut into
       ``chunk_size``-character windows so that no unit is oversized.
    3. Greedily pack units into chunks up to ``chunk_size`` chars, joining
       them with a blank line.
    4. Carry the last ``overlap`` characters of each emitted chunk into the
       next one so retrieval doesn't lose context at boundaries. Because of
       that carried tail, a chunk may exceed ``chunk_size`` by up to
       ``overlap`` characters plus the separator.

    Args:
        doc: Document whose ``text`` is chunked; an empty or ``None`` text
            yields no chunks.
        chunk_size: Target maximum number of characters per chunk.
        overlap: Number of trailing characters repeated at the start of the
            following chunk. ``0`` disables the overlap.

    Returns:
        The chunks in document order. Offsets are computed over the
        re-joined, whitespace-normalised units rather than the raw
        ``doc.text``.
    """
    text = (doc.text or "").strip()
    if not text:
        return []

    paragraphs = [p.strip() for p in _PARA_SPLIT.split(text) if p.strip()]
    if not paragraphs:
        paragraphs = [text]

    # Stage 1: reduce the text to "units" that each fit within chunk_size.
    units: list[str] = []
    for para in paragraphs:
        if len(para) <= chunk_size:
            units.append(para)
            continue

        buf = ""
        for sent in _SENT_SPLIT.split(para):
            # "+ 1" accounts for the joining space.
            if len(buf) + len(sent) + 1 <= chunk_size:
                buf = f"{buf} {sent}".strip()
                continue

            if buf:
                units.append(buf)

            if chunk_size > 0 and len(sent) > chunk_size:
                # The sentence alone is too long: fall back to fixed-size
                # windows. All but the last window are final units; the last
                # one stays in the buffer so following sentences can join it.
                windows = [
                    piece
                    for piece in (
                        sent[start : start + chunk_size].strip()
                        for start in range(0, len(sent), chunk_size)
                    )
                    if piece
                ]
                units.extend(windows[:-1])
                buf = windows[-1] if windows else ""
            else:
                buf = sent
        if buf:
            units.append(buf)

    # Stage 2: pack units into chunks, carrying an overlap tail across chunks.
    chunks: list[Chunk] = []
    buf = ""
    offset = 0
    for unit in units:
        candidate = f"{buf}\n\n{unit}".strip() if buf else unit
        if len(candidate) <= chunk_size:
            buf = candidate
            continue

        if buf:
            chunks.append(_make_chunk(doc, buf, offset))
            # The next chunk starts where the carried tail begins.
            offset += max(len(buf) - overlap, 0)
            tail = buf[-overlap:] if overlap else ""
            buf = f"{tail}\n\n{unit}".strip() if tail else unit
        else:
            buf = unit

    if buf:
        chunks.append(_make_chunk(doc, buf, offset))

    return chunks
74
+
75
+
76
def _make_chunk(doc: Document, text: str, start_offset: int) -> Chunk:
    """Build the :class:`Chunk` for ``text`` taken from ``doc``.

    ``end_offset`` is ``start_offset`` plus the length of ``text``; the
    document's site and title are copied into the chunk metadata.
    """
    doc_id = doc.doc_id
    end_offset = start_offset + len(text)
    provenance = {"site": doc.site, "title": doc.title}
    return Chunk(
        doc_id=doc_id,
        text=text,
        start_offset=start_offset,
        end_offset=end_offset,
        metadata=provenance,
    )
bie/cli.py ADDED
@@ -0,0 +1,275 @@
1
+ """
2
+ BIE command-line interface.
3
+
4
+ Examples::
5
+
6
+ bie search "AI regulation 2026" --url https://example.com/news
7
+ bie search-live "who won the latest F1 race"
8
+ bie extract https://example.com/article
9
+ bie map https://example.com
10
+ bie crawl https://example.com --max-pages 20 --instruction "pricing pages"
11
+ bie serve --port 8000
12
+ bie mcp
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import sys
19
+
20
+ import click
21
+
22
+ from bie import __version__
23
+ from bie.config import BIESettings
24
+ from bie.engine import BIE
25
+
26
+
27
# Root command group: every ``bie <subcommand>`` is registered on it through
# ``@cli.command``. The docstring below is the text shown by ``bie --help``.
@click.group()
@click.version_option(version=__version__, prog_name="bie")
def cli() -> None:
    """BIE — BitSearch Intelligence Engine. Real-time web search & extraction for AI apps."""
31
+
32
+
33
@cli.command()
@click.argument("query")
@click.option("--url", "urls", multiple=True, required=True, help="Seed URL(s) to crawl & search")
@click.option("--top-k", default=10, show_default=True, help="Number of results to return")
@click.option("--max-pages", default=10, show_default=True, help="Max pages to crawl per seed URL")
@click.option("--no-embeddings", is_flag=True, help="Disable semantic/vector search (BM25 only)")
@click.option("--json", "as_json", is_flag=True, help="Output raw JSON")
def search(
    query: str,
    urls: tuple[str, ...],
    top_k: int,
    max_pages: int,
    no_embeddings: bool,
    as_json: bool,
) -> None:
    """Crawl URL(s) and search the freshly indexed content for QUERY."""
    engine = BIE(BIESettings(max_pages=max_pages, use_embeddings=not no_embeddings))

    # Progress messages go to stderr so stdout carries only the results.
    click.echo(f"Crawling {len(urls)} source(s)...", err=True)
    indexed = engine.crawl(list(urls))
    click.echo(f"Indexed {indexed} document(s). Searching...", err=True)

    response = engine.search_full(query, top_k=top_k)

    if as_json:
        click.echo(response.model_dump_json(indent=2))
    elif not response.results:
        click.echo("No results found.")
    else:
        for rank, hit in enumerate(response.results, start=1):
            click.echo(f"\n{rank}. {hit.title}")
            click.echo(f" {hit.url}")
            click.echo(f" score={hit.score:.4f} trust={hit.trust_score:.2f}")
            click.echo(f" {hit.snippet}")
        click.echo(f"\n({response.took_ms} ms, {response.total_indexed_documents} docs indexed)")
64
+
65
+
66
@cli.command(name="search-live")
@click.argument("query")
@click.option("--top-k", default=10, show_default=True, help="Number of results to return")
@click.option("--discovery-results", default=8, show_default=True, help="Candidate URLs to discover")
@click.option("--no-deep", is_flag=True, help="Skip crawling; return raw discovery order without snippets")
@click.option("--no-embeddings", is_flag=True, help="Disable semantic/vector re-ranking (BM25 only)")
@click.option("--json", "as_json", is_flag=True, help="Output raw JSON")
def search_live(
    query: str,
    top_k: int,
    discovery_results: int,
    no_deep: bool,
    no_embeddings: bool,
    as_json: bool,
) -> None:
    """Search the live internet for QUERY — no seed URLs, no API key, no subscription.

    Discovers relevant URLs via free public search endpoints (DuckDuckGo,
    with a Bing fallback), crawls them with Bitscrape, and ranks the
    extracted content against QUERY using BIE's hybrid BM25+vector index.
    """
    import bie

    # The CLI flags are negative ("--no-..."); the library API takes the
    # positive form, hence the inversions.
    results = bie.websearch(
        query,
        top_k=top_k,
        discovery_results=discovery_results,
        deep=not no_deep,
        use_embeddings=not no_embeddings,
    )

    if as_json:
        # mode="json" makes pydantic emit JSON-compatible values only, so
        # json.dumps cannot fail on a field type it does not know how to
        # encode. This mirrors the model_dump_json call used by `bie search`.
        click.echo(json.dumps([r.model_dump(mode="json") for r in results], indent=2))
        return

    if not results:
        click.echo(
            "No results found. The free search backends may be temporarily "
            "rate-limiting — try again in a moment."
        )
        return

    for i, r in enumerate(results, 1):
        click.echo(f"\n{i}. {r.title}")
        click.echo(f" {r.url}")
        click.echo(f" score={r.score:.4f}")
        # Snippets are optional here: --no-deep returns results without them.
        if r.snippet:
            click.echo(f" {r.snippet}")
114
+
115
+
116
@cli.command()
@click.argument("urls", nargs=-1, required=True)
@click.option("--max-pages", default=40, show_default=True)
@click.option("--max-depth", default=2, show_default=True)
@click.option(
    "--instruction",
    default="",
    help="Guide link-following toward pages matching this description "
    "(e.g. 'pricing and plans pages')",
)
@click.option("--out", "output", default=None, help="Write extracted documents as JSONL to this path")
def crawl(
    urls: tuple[str, ...],
    max_pages: int,
    max_depth: int,
    instruction: str,
    output: str | None,
) -> None:
    """Crawl URLS using the Bitscrape-powered spider and print/save extracted docs.

    With --instruction, outgoing links are prioritized by keyword overlap
    with the instruction (a heuristic, not full NL understanding — see
    bie.crawl_site docs).
    """
    # Only the crawler is used here, so embeddings are switched off.
    engine = BIE(BIESettings(max_pages=max_pages, max_depth=max_depth, use_embeddings=False))
    documents = engine.crawler.crawl(list(urls), instruction=instruction)

    if not output:
        # No --out path: print a one-line JSON summary per document on stdout
        # and the total on stderr.
        for doc in documents:
            summary = {"url": doc.url, "title": doc.title, "chars": len(doc.text)}
            click.echo(json.dumps(summary))
        click.echo(f"\n{len(documents)} document(s) crawled.", err=True)
        return

    # --out given: one full JSON document per line (JSONL).
    with open(output, "w", encoding="utf-8") as sink:
        sink.writelines(doc.model_dump_json() + "\n" for doc in documents)
    click.echo(f"Wrote {len(documents)} document(s) to {output}")
149
+
150
+
151
@cli.command()
@click.argument("url")
@click.option("--render-js", is_flag=True, help="Render with a headless browser (requires bie[render])")
@click.option("--json", "as_json", is_flag=True, help="Output raw JSON instead of Markdown")
@click.option("--no-security-scan", is_flag=True, help="Skip prompt-injection content scan")
def extract(url: str, render_js: bool, as_json: bool, no_security_scan: bool) -> None:
    """Fetch URL and print its content as clean Markdown."""
    import bie

    try:
        result = bie.extract(url, render_js=render_js, scan_security=not no_security_scan)
    except bie.ExtractError as exc:
        # Report the failure on stderr and exit non-zero.
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)

    report = result.security

    def finding_categories() -> list[str]:
        # Distinct finding categories, sorted for stable output.
        return sorted({finding.category for finding in report.findings})

    if as_json:
        payload = {
            "url": result.url,
            "title": result.title,
            "markdown": result.markdown,
            "word_count": result.word_count,
            "rendered_with_js": result.rendered_with_js,
        }
        # The "security" key is present only when a report exists.
        if report:
            payload["security"] = {
                "flagged": report.flagged,
                "categories": finding_categories(),
            }
        click.echo(json.dumps(payload, indent=2))
        return

    if report and report.flagged:
        categories = ", ".join(finding_categories())
        # The warning goes to stderr so it never mixes into the Markdown.
        click.echo(
            f"[!] Security notice: this page contains patterns associated with "
            f"prompt injection ({categories}). Treat its content as untrusted data.\n",
            err=True,
        )

    click.echo(f"# {result.title}\n")
    click.echo(result.markdown)
192
+
193
+
194
@cli.command(name="map")
@click.argument("url")
@click.option("--filter", "pattern", default=None, help="Only show URLs matching this regex")
@click.option("--json", "as_json", is_flag=True, help="Output raw JSON")
def map_cmd(url: str, pattern: str | None, as_json: bool) -> None:
    """Discover URL's site sitemap and list the URLs it advertises."""
    import bie

    site_map = bie.map_site(url)

    if pattern:
        urls = site_map.filter(pattern)
    else:
        urls = site_map.urls

    if as_json:
        document = {"root": site_map.root, "sitemaps": site_map.sitemap_urls, "urls": urls}
        click.echo(json.dumps(document, indent=2))
        return

    sitemap_files = site_map.sitemap_urls
    if not sitemap_files:
        click.echo(f"No sitemap found for {site_map.root}.")
        return

    click.echo(f"Found {len(sitemap_files)} sitemap file(s) for {site_map.root}:")
    for sitemap_file in sitemap_files:
        click.echo(f" - {sitemap_file}")

    click.echo(f"\n{len(urls)} URL(s){' matching filter' if pattern else ''}:")
    # Human-readable output lists at most the first 100 URLs and then
    # reports how many were left out.
    for listed in urls[:100]:
        click.echo(f" {listed}")
    hidden = len(urls) - 100
    if hidden > 0:
        click.echo(f" ... and {hidden} more")
222
+
223
+
224
@cli.command()
@click.option("--host", default=None, help="Bind host (default from settings / 0.0.0.0)")
@click.option("--port", default=None, type=int, help="Bind port (default from settings / 8000)")
@click.option("--reload", is_flag=True, help="Auto-reload on code changes (dev only)")
def serve(host: str | None, port: int | None, reload: bool) -> None:
    """Run the BIE REST API server (FastAPI + Uvicorn)."""
    # uvicorn is imported lazily so a missing install produces the hint
    # below instead of an import traceback.
    try:
        import uvicorn
    except ImportError:
        click.echo("uvicorn is required: pip install 'bits-bie[server]'", err=True)
        sys.exit(1)

    settings = BIESettings()

    # Compare against None rather than relying on truthiness: ``--port 0``
    # is an explicit value (ask the OS for a free port) and must not be
    # replaced by the configured default.
    bind_port = settings.port if port is None else port

    uvicorn.run(
        "bie.server:app",
        host=host or settings.host,
        port=bind_port,
        reload=reload,
    )
243
+
244
+
245
@cli.command()
def mcp() -> None:
    """Run BIE as a Model Context Protocol (MCP) server over stdio.

    Add to your MCP client config (e.g. Claude Desktop) as a command:

    \b
    {
      "mcpServers": {
        "bie": {
          "command": "bie",
          "args": ["mcp"]
        }
      }
    }
    """
    # The MCP server module is imported lazily; if the import fails, print
    # the install hint for the optional extra and exit non-zero.
    try:
        from bie.mcp.server import run_mcp_server as start_server
    except ImportError:
        click.echo("MCP support requires: pip install 'bits-bie[mcp]'", err=True)
        sys.exit(1)

    start_server()
268
+
269
+
270
def main() -> None:
    """Entry point: invoke the ``cli`` command group."""
    cli()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
bie/config.py ADDED
@@ -0,0 +1,57 @@
1
+ """
2
+ BIE configuration.
3
+
4
+ All settings can be overridden via environment variables prefixed with
5
+ ``BIE_`` (e.g. ``BIE_MAX_PAGES=200``) or passed directly to
6
+ ``BIESettings(...)``.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from pydantic import Field
12
+ from pydantic_settings import BaseSettings, SettingsConfigDict
13
+
14
+
15
class BIESettings(BaseSettings):
    # Settings container for BIE. Values come from constructor arguments,
    # ``BIE_``-prefixed environment variables, or a ``.env`` file (see
    # ``model_config`` at the bottom of the class).

    # --- Crawl behaviour (delegated to Bitscrape) -----------------------
    max_pages: int = Field(default=40, ge=1, description="Max pages to crawl per source URL")
    max_depth: int = Field(default=2, ge=0, description="Max link-follow depth")
    concurrent_requests: int = Field(default=16, ge=1, le=256)
    download_delay: float = Field(default=0.0, ge=0.0)
    # NOTE(review): the "BIE/0.1" token is a fixed literal and is not derived
    # from the package version — confirm whether it should track releases.
    user_agent: str = "BIE/0.1 (+https://github.com/Sudharsansm/BIE) bitscrape"
    robotstxt_obey: bool = True
    request_timeout: float = Field(default=20.0, ge=1.0)
    use_playwright: bool = False

    # --- Indexing / retrieval --------------------------------------------
    chunk_size: int = Field(default=800, ge=100, description="Approx characters per chunk")
    chunk_overlap: int = Field(default=100, ge=0)
    use_embeddings: bool = Field(
        default=True,
        description="Enable semantic (vector) search via sentence-transformers. "
        "Falls back to BM25-only if the model can't be loaded.",
    )
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    # Each weight is constrained to [0, 1] independently; nothing here
    # requires the two to sum to 1.
    bm25_weight: float = Field(default=0.5, ge=0.0, le=1.0)
    vector_weight: float = Field(default=0.5, ge=0.0, le=1.0)

    # --- Storage -----------------------------------------------------------
    index_dir: str = Field(default=".bie_index", description="Directory for persisted index")
    persist: bool = Field(default=False, description="Persist index to disk between runs")

    # --- Server --------------------------------------------------------------
    # NOTE(review): the default host binds every interface while ``api_key``
    # defaults to None — confirm this is intended for deployments.
    host: str = "0.0.0.0"
    port: int = 8000
    api_key: str | None = Field(
        default=None,
        description="If set, all /search and /crawl endpoints require "
        "an `Authorization: Bearer <key>` header.",
    )

    # Environment loading: ``BIE_`` prefix, case-insensitive names, optional
    # UTF-8 ``.env`` file, unknown keys ignored.
    model_config = SettingsConfigDict(
        env_prefix="BIE_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )
bie/crawler.py ADDED
@@ -0,0 +1,115 @@
1
+ """
2
+ BIE Crawler — thin orchestration layer over Bitscrape's Engine.
3
+
4
+ Implements PRD Module 1 (Crawler) for the OSS edition: runs the
5
+ :class:`bie.spiders.generic.BIESpider` against one or more seed URLs,
6
+ collects extracted pages in-memory as :class:`bie.models.Document`
7
+ objects, ready for chunking + indexing.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ import logging
14
+ from typing import Any
15
+ from urllib.parse import urlparse
16
+
17
+ import bitscrape
18
+ from bitscrape.pipeline.pipelines import BasePipeline
19
+
20
+ from bie.config import BIESettings
21
+ from bie.models import Document
22
+ from bie.spiders.generic import BIESpider
23
+
24
+ logger = logging.getLogger("bie.crawler")
25
+
26
+
27
class _CollectorPipeline(BasePipeline):
    """Collects every scraped item into an in-memory list."""

    def __init__(self) -> None:
        # NOTE(review): BasePipeline.__init__ is not invoked — confirm the
        # base class requires no setup of its own.
        collected: list[dict[str, Any]] = []
        self.items = collected

    async def process_item(self, item: Any, spider: Any) -> Any:
        """Record ``item`` and hand it back unchanged; ``spider`` is unused."""
        self.items.append(item)
        return item
36
+
37
+
38
class Crawler:
    """Crawls a list of seed URLs using Bitscrape and returns Documents."""

    def __init__(self, settings: BIESettings | None = None) -> None:
        """Store ``settings``, or build default ``BIESettings`` when omitted."""
        self.settings = settings or BIESettings()

    def crawl(
        self, urls: list[str], allowed_domains: list[str] | None = None, instruction: str = ""
    ) -> list[Document]:
        """Synchronous convenience wrapper around :meth:`acrawl`.

        Runs the coroutine with :func:`asyncio.run`, so it cannot be used
        from a thread that already has a running event loop; await
        :meth:`acrawl` directly in that case.
        """
        return asyncio.run(self.acrawl(urls, allowed_domains, instruction))

    async def acrawl(
        self,
        urls: list[str],
        allowed_domains: list[str] | None = None,
        instruction: str = "",
    ) -> list[Document]:
        """Crawl ``urls`` and return one ``Document`` per page that has text.

        Args:
            urls: Seed URLs. An empty list returns ``[]`` without crawling.
            allowed_domains: Domains handed to the spider. When ``None`` they
                are derived from the network locations of ``urls``.
            instruction: Free-text instruction passed through to the spider.

        Returns:
            Documents built from the collected items; items without text
            are skipped.
        """
        if not urls:
            return []

        if allowed_domains is None:
            # Parse each seed once; seeds without a netloc contribute nothing.
            hosts = {urlparse(u).netloc for u in urls}
            hosts.discard("")
            allowed_domains = sorted(hosts)

        bs_settings = bitscrape.Settings(
            concurrent_requests=self.settings.concurrent_requests,
            download_delay=self.settings.download_delay,
            user_agent=self.settings.user_agent,
            robotstxt_obey=self.settings.robotstxt_obey,
            download_timeout=self.settings.request_timeout,
            max_depth=self.settings.max_depth,
        )

        spider = BIESpider(settings=bs_settings)
        spider.start_urls = list(urls)
        spider.allowed_domains = allowed_domains
        spider.max_pages = self.settings.max_pages
        spider.max_depth = self.settings.max_depth
        spider.instruction = instruction

        collector = _CollectorPipeline()

        middlewares = [
            bitscrape.UserAgentMiddleware(),
            bitscrape.CookieMiddleware(),
        ]
        if bs_settings.robotstxt_obey:
            # The robots middleware is placed first in the list.
            middlewares.insert(0, bitscrape.RobotsMiddleware())

        engine = bitscrape.Engine(
            spider=spider,
            settings=bs_settings,
            pipelines=[collector],
            middlewares=middlewares,
        )

        stats = await engine.run()
        logger.info(
            "Crawled %d page(s) from %d seed URL(s) — %d failed",
            stats.items_scraped,
            len(urls),
            stats.requests_failed,
        )

        return [
            Document(
                url=item["url"],
                title=self._resolve_title(item),
                text=item["text"],
                site=urlparse(item["url"]).netloc,
                metadata={"depth": item.get("depth", 0)},
            )
            for item in collector.items
            if item.get("text")
        ]

    @staticmethod
    def _resolve_title(item: dict[str, Any]) -> str:
        """Return the item's title, or its URL when the title is unusable.

        A title that is missing, ``None``, not a string, or blank falls back
        to ``item["url"]``; a usable title is returned unmodified.
        """
        title = item.get("title")
        if isinstance(title, str) and title.strip():
            return title
        return item["url"]