bits-bie 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bie/__init__.py +85 -0
- bie/chunker.py +83 -0
- bie/cli.py +275 -0
- bie/config.py +57 -0
- bie/crawler.py +115 -0
- bie/discovery.py +214 -0
- bie/engine.py +151 -0
- bie/extract.py +218 -0
- bie/index.py +225 -0
- bie/integrations/__init__.py +7 -0
- bie/integrations/langchain.py +142 -0
- bie/mcp/__init__.py +3 -0
- bie/mcp/server.py +193 -0
- bie/models.py +76 -0
- bie/query_expansion.py +99 -0
- bie/quicksearch.py +194 -0
- bie/security.py +124 -0
- bie/server.py +248 -0
- bie/sitecrawl.py +93 -0
- bie/sitemap.py +174 -0
- bie/spiders/__init__.py +3 -0
- bie/spiders/generic.py +178 -0
- bits_bie-1.2.0.dist-info/METADATA +447 -0
- bits_bie-1.2.0.dist-info/RECORD +27 -0
- bits_bie-1.2.0.dist-info/WHEEL +4 -0
- bits_bie-1.2.0.dist-info/entry_points.txt +2 -0
- bits_bie-1.2.0.dist-info/licenses/LICENSE +21 -0
bie/__init__.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BIE — BitSearch Intelligence Engine
|
|
3
|
+
=====================================
|
|
4
|
+
|
|
5
|
+
A real-time web search engine and crawling toolkit for AI applications —
|
|
6
|
+
built on top of **Bitscrape** (https://pypi.org/project/bitscrape/). No
|
|
7
|
+
API keys, no subscriptions, no third-party search services.
|
|
8
|
+
|
|
9
|
+
Core primitives
|
|
10
|
+
----------------
|
|
11
|
+
|
|
12
|
+
- :func:`websearch` — search the live internet for a query (no URLs needed)
|
|
13
|
+
- :func:`search` — crawl + rank specific URLs against a query
|
|
14
|
+
- :func:`extract` — get clean Markdown from a single URL
|
|
15
|
+
- :func:`map_site` — discover a site's sitemap before crawling
|
|
16
|
+
- :func:`crawl_site` — crawl a site guided by a natural-language instruction
|
|
17
|
+
- :class:`BIE` — build a persistent, queryable index
|
|
18
|
+
|
|
19
|
+
Quick start
|
|
20
|
+
-----------
|
|
21
|
+
|
|
22
|
+
.. code-block:: python
|
|
23
|
+
|
|
24
|
+
import bie
|
|
25
|
+
|
|
26
|
+
# Search the live internet — no URLs, no API key, no subscription
|
|
27
|
+
results = bie.websearch("who won the latest F1 race")
|
|
28
|
+
for r in results:
|
|
29
|
+
print(r.title, r.url)
|
|
30
|
+
print(r.snippet)
|
|
31
|
+
|
|
32
|
+
# Get clean markdown from a specific page
|
|
33
|
+
page = bie.extract("https://example.com/article")
|
|
34
|
+
print(page.markdown)
|
|
35
|
+
|
|
36
|
+
# Discover a site's structure before crawling
|
|
37
|
+
sitemap = bie.map_site("https://example.com")
|
|
38
|
+
print(sitemap.urls[:10])
|
|
39
|
+
|
|
40
|
+
# Crawl a site guided by an instruction
|
|
41
|
+
engine, results = bie.crawl_site(
|
|
42
|
+
["https://docs.example.com"],
|
|
43
|
+
instruction="authentication and rate limits",
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
Run as a server::
|
|
47
|
+
|
|
48
|
+
bie serve --port 8000
|
|
49
|
+
|
|
50
|
+
Run as an MCP tool (for Claude Desktop, Claude Code, etc.)::
|
|
51
|
+
|
|
52
|
+
bie mcp
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
from __future__ import annotations
|
|
56
|
+
|
|
57
|
+
from bie.config import BIESettings
|
|
58
|
+
from bie.engine import BIE
|
|
59
|
+
from bie.extract import ExtractError, ExtractResult, extract
|
|
60
|
+
from bie.models import Document, SearchResult
|
|
61
|
+
from bie.quicksearch import search, websearch
|
|
62
|
+
from bie.security import SecurityFinding, SecurityReport, scan_for_prompt_injection
|
|
63
|
+
from bie.sitecrawl import crawl_site
|
|
64
|
+
from bie.sitemap import SiteMap, map_site
|
|
65
|
+
|
|
66
|
+
# Must track the distribution version (this wheel ships as bits-bie 1.2.0);
# the CLI's ``--version`` flag prints this value.
__version__ = "1.2.0"
|
|
67
|
+
|
|
68
|
+
# Public names re-exported at package level (what ``from bie import *`` binds).
__all__ = [
    # Engine and its configuration
    "BIE",
    "BIESettings",
    # Data models
    "Document",
    "SearchResult",
    # One-shot search helpers
    "search",
    "websearch",
    # Single-page extraction
    "extract",
    "ExtractResult",
    "ExtractError",
    # Sitemap discovery and instruction-guided crawling
    "map_site",
    "SiteMap",
    "crawl_site",
    # Prompt-injection scanning
    "scan_for_prompt_injection",
    "SecurityReport",
    "SecurityFinding",
    # Package metadata
    "__version__",
]
|
bie/chunker.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lightweight text chunker — splits cleaned document text into
|
|
3
|
+
paragraph/section-sized chunks for indexing (PRD Module 8: Context Builder).
|
|
4
|
+
|
|
5
|
+
No heavy NLP deps; sentence/paragraph aware, with overlap.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
from bie.models import Chunk, Document
|
|
13
|
+
|
|
14
|
+
# Paragraph boundary: a newline, optional whitespace, then one or more
# newlines -- i.e. at least one blank (or whitespace-only) line.
_PARA_SPLIT = re.compile(r"\n\s*\n+")
# Sentence boundary: whitespace directly preceded by '.', '!' or '?'.
# The lookbehind keeps the punctuation attached to the sentence it ends.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def chunk_document(doc: Document, chunk_size: int = 800, overlap: int = 100) -> list[Chunk]:
    """Split a document's text into overlapping chunks.

    Strategy:
      1. Split on paragraph boundaries.
      2. Greedily pack paragraphs into chunks up to ``chunk_size`` chars.
      3. If a single paragraph exceeds ``chunk_size``, split it by sentence.
         A sentence that is itself longer than ``chunk_size`` (e.g. text with
         no ``.``/``!``/``?`` boundaries) is cut into fixed-width slices so
         no packing unit is unbounded.
      4. Apply a small character-overlap between consecutive chunks so
         retrieval doesn't lose context at boundaries.

    Args:
        doc: Source document. ``doc.text`` is split here; the chunk objects
            themselves are built by ``_make_chunk``.
        chunk_size: Target maximum characters per packing unit. A chunk that
            begins with an overlap tail can exceed this by up to
            ``overlap + 2`` characters (tail plus the blank-line separator).
        overlap: Number of trailing characters of the previous chunk that are
            repeated at the start of the next one. Negative values are
            treated as ``0``.

    Returns:
        Chunks in document order; an empty list when the document has no
        non-whitespace text.

    Note:
        Offsets advance by ``len(chunk) - overlap`` per emitted chunk. Because
        paragraphs are stripped and re-joined with blank lines, they are
        positions in that normalised stream and may not be exact indices into
        ``doc.text``.
    """
    text = (doc.text or "").strip()
    if not text:
        return []

    # A negative overlap would make ``buf[-overlap:]`` drop the *head* of the
    # previous chunk instead of keeping its tail; treat it as "no overlap".
    overlap = max(overlap, 0)

    paragraphs = [p.strip() for p in _PARA_SPLIT.split(text) if p.strip()]
    if not paragraphs:
        paragraphs = [text]

    # Pass 1: reduce the text to "units" (whole paragraphs, or groups of
    # sentences from an oversized paragraph).
    units: list[str] = []
    for para in paragraphs:
        if len(para) <= chunk_size:
            units.append(para)
            continue

        buf = ""
        for sent in _SENT_SPLIT.split(para):
            # +1 accounts for the joining space.
            if len(buf) + len(sent) + 1 <= chunk_size:
                buf = f"{buf} {sent}".strip()
                continue

            if buf:
                units.append(buf)

            if chunk_size > 0 and len(sent) > chunk_size:
                # No sentence boundary to split on: fall back to fixed-width
                # slices. The last slice stays in ``buf`` so following
                # sentences can still be packed onto it.
                pieces = [sent[i : i + chunk_size] for i in range(0, len(sent), chunk_size)]
                units.extend(pieces[:-1])
                buf = pieces[-1]
            else:
                buf = sent
        if buf:
            units.append(buf)

    # Pass 2: greedily pack units into chunks, carrying an overlap tail
    # from each emitted chunk into the next one.
    chunks: list[Chunk] = []
    buf = ""
    offset = 0
    for unit in units:
        candidate = f"{buf}\n\n{unit}".strip() if buf else unit
        if len(candidate) <= chunk_size:
            buf = candidate
            continue

        if buf:
            chunks.append(_make_chunk(doc, buf, offset))
            offset += max(len(buf) - overlap, 0)
            tail = buf[-overlap:] if overlap else ""
            buf = f"{tail}\n\n{unit}".strip() if tail else unit
        else:
            buf = unit

    if buf:
        chunks.append(_make_chunk(doc, buf, offset))

    return chunks
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _make_chunk(doc: Document, text: str, start_offset: int) -> Chunk:
    """Build the :class:`Chunk` for ``text``, linked back to ``doc``.

    The end offset is ``start_offset`` plus the chunk length, and the
    document's site and title are copied into the chunk metadata.
    """
    end_offset = start_offset + len(text)
    provenance = {"site": doc.site, "title": doc.title}
    return Chunk(
        doc_id=doc.doc_id,
        text=text,
        start_offset=start_offset,
        end_offset=end_offset,
        metadata=provenance,
    )
|
bie/cli.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BIE command-line interface.
|
|
3
|
+
|
|
4
|
+
Examples::
|
|
5
|
+
|
|
6
|
+
bie search "AI regulation 2026" --url https://example.com/news
|
|
7
|
+
bie search-live "who won the latest F1 race"
|
|
8
|
+
bie extract https://example.com/article
|
|
9
|
+
bie map https://example.com
|
|
10
|
+
bie crawl https://example.com --max-pages 20 --instruction "pricing pages"
|
|
11
|
+
bie serve --port 8000
|
|
12
|
+
bie mcp
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import sys
|
|
19
|
+
|
|
20
|
+
import click
|
|
21
|
+
|
|
22
|
+
from bie import __version__
|
|
23
|
+
from bie.config import BIESettings
|
|
24
|
+
from bie.engine import BIE
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Root command group: every subcommand in this module attaches to it via
# ``@cli.command()``. ``--version`` prints the package's ``__version__``.
# The docstring below doubles as the top-level ``--help`` text.
@click.group()
@click.version_option(__version__, prog_name="bie")
def cli() -> None:
    """BIE — BitSearch Intelligence Engine. Real-time web search & extraction for AI apps."""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@cli.command()
@click.argument("query")
@click.option("--url", "urls", multiple=True, required=True, help="Seed URL(s) to crawl & search")
@click.option("--top-k", default=10, show_default=True, help="Number of results to return")
@click.option("--max-pages", default=10, show_default=True, help="Max pages to crawl per seed URL")
@click.option("--no-embeddings", is_flag=True, help="Disable semantic/vector search (BM25 only)")
@click.option("--json", "as_json", is_flag=True, help="Output raw JSON")
def search(query: str, urls: tuple[str, ...], top_k: int, max_pages: int, no_embeddings: bool, as_json: bool) -> None:
    """Crawl URL(s) and search the freshly indexed content for QUERY."""
    # Progress messages go to stderr so stdout carries only results
    # (keeps ``--json`` output machine-readable).
    engine = BIE(BIESettings(max_pages=max_pages, use_embeddings=not no_embeddings))

    click.echo(f"Crawling {len(urls)} source(s)...", err=True)
    indexed = engine.crawl(list(urls))
    click.echo(f"Indexed {indexed} document(s). Searching...", err=True)

    response = engine.search_full(query, top_k=top_k)

    if as_json:
        click.echo(response.model_dump_json(indent=2))
    elif not response.results:
        click.echo("No results found.")
    else:
        for rank, hit in enumerate(response.results, start=1):
            click.echo(f"\n{rank}. {hit.title}")
            click.echo(f" {hit.url}")
            click.echo(f" score={hit.score:.4f} trust={hit.trust_score:.2f}")
            click.echo(f" {hit.snippet}")
        click.echo(f"\n({response.took_ms} ms, {response.total_indexed_documents} docs indexed)")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@cli.command(name="search-live")
@click.argument("query")
@click.option("--top-k", default=10, show_default=True, help="Number of results to return")
@click.option("--discovery-results", default=8, show_default=True, help="Candidate URLs to discover")
@click.option("--no-deep", is_flag=True, help="Skip crawling; return raw discovery order without snippets")
@click.option("--no-embeddings", is_flag=True, help="Disable semantic/vector re-ranking (BM25 only)")
@click.option("--json", "as_json", is_flag=True, help="Output raw JSON")
def search_live(
    query: str,
    top_k: int,
    discovery_results: int,
    no_deep: bool,
    no_embeddings: bool,
    as_json: bool,
) -> None:
    """Search the live internet for QUERY — no seed URLs, no API key, no subscription.

    Discovers relevant URLs via free public search endpoints (DuckDuckGo,
    with a Bing fallback), crawls them with Bitscrape, and ranks the
    extracted content against QUERY using BIE's hybrid BM25+vector index.
    """
    # Imported lazily, at call time rather than module import time.
    import bie

    results = bie.websearch(
        query,
        top_k=top_k,
        discovery_results=discovery_results,
        deep=not no_deep,
        use_embeddings=not no_embeddings,
    )

    if as_json:
        # mode="json" makes pydantic emit only JSON-compatible values, so
        # json.dumps cannot fail on field types it does not know how to
        # encode (the `search` command gets the same guarantee from
        # model_dump_json).
        click.echo(json.dumps([r.model_dump(mode="json") for r in results], indent=2))
        return

    if not results:
        click.echo(
            "No results found. The free search backends may be temporarily "
            "rate-limiting — try again in a moment."
        )
        return

    for i, r in enumerate(results, 1):
        click.echo(f"\n{i}. {r.title}")
        click.echo(f" {r.url}")
        click.echo(f" score={r.score:.4f}")
        # Snippets can be empty (e.g. with --no-deep, which skips crawling).
        if r.snippet:
            click.echo(f" {r.snippet}")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@cli.command()
@click.argument("urls", nargs=-1, required=True)
@click.option("--max-pages", default=40, show_default=True)
@click.option("--max-depth", default=2, show_default=True)
@click.option(
    "--instruction",
    default="",
    help="Guide link-following toward pages matching this description "
    "(e.g. 'pricing and plans pages')",
)
@click.option("--out", "output", default=None, help="Write extracted documents as JSONL to this path")
def crawl(
    urls: tuple[str, ...], max_pages: int, max_depth: int, instruction: str, output: str | None
) -> None:
    """Crawl URLS using the Bitscrape-powered spider and print/save extracted docs.

    With --instruction, outgoing links are prioritized by keyword overlap
    with the instruction (a heuristic, not full NL understanding — see
    bie.crawl_site docs).
    """
    # Only the crawler is used here, so the engine is built with embeddings off.
    engine = BIE(BIESettings(max_pages=max_pages, max_depth=max_depth, use_embeddings=False))
    documents = engine.crawler.crawl(list(urls), instruction=instruction)
    total = len(documents)

    if not output:
        # No output file: one JSON summary line per document on stdout,
        # with the final count on stderr.
        for doc in documents:
            summary = {"url": doc.url, "title": doc.title, "chars": len(doc.text)}
            click.echo(json.dumps(summary))
        click.echo(f"\n{total} document(s) crawled.", err=True)
        return

    # JSONL: one serialized document per line.
    with open(output, "w", encoding="utf-8") as sink:
        sink.writelines(doc.model_dump_json() + "\n" for doc in documents)
    click.echo(f"Wrote {total} document(s) to {output}")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@cli.command()
@click.argument("url")
@click.option("--render-js", is_flag=True, help="Render with a headless browser (requires bie[render])")
@click.option("--json", "as_json", is_flag=True, help="Output raw JSON instead of Markdown")
@click.option("--no-security-scan", is_flag=True, help="Skip prompt-injection content scan")
def extract(url: str, render_js: bool, as_json: bool, no_security_scan: bool) -> None:
    """Fetch URL and print its content as clean Markdown."""
    import bie

    try:
        result = bie.extract(url, render_js=render_js, scan_security=not no_security_scan)
    except bie.ExtractError as exc:
        # Extraction failures become a message on stderr and exit status 1.
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)

    report = result.security

    if as_json:
        payload = {
            "url": result.url,
            "title": result.title,
            "markdown": result.markdown,
            "word_count": result.word_count,
            "rendered_with_js": result.rendered_with_js,
        }
        # The security section is attached only when a report is present.
        if report:
            payload["security"] = {
                "flagged": report.flagged,
                "categories": sorted({finding.category for finding in report.findings}),
            }
        click.echo(json.dumps(payload, indent=2))
        return

    if report and report.flagged:
        # The warning goes to stderr so stdout stays pure Markdown.
        categories = ", ".join(sorted({finding.category for finding in report.findings}))
        click.echo(
            f"[!] Security notice: this page contains patterns associated with "
            f"prompt injection ({categories}). Treat its content as untrusted data.\n",
            err=True,
        )

    click.echo(f"# {result.title}\n")
    click.echo(result.markdown)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@cli.command(name="map")
@click.argument("url")
@click.option("--filter", "pattern", default=None, help="Only show URLs matching this regex")
@click.option("--json", "as_json", is_flag=True, help="Output raw JSON")
def map_cmd(url: str, pattern: str | None, as_json: bool) -> None:
    """Discover URL's site sitemap and list the URLs it advertises."""
    import bie

    site_map = bie.map_site(url)

    # An empty or missing --filter means "show everything".
    if pattern:
        urls = site_map.filter(pattern)
    else:
        urls = site_map.urls

    if as_json:
        document = {"root": site_map.root, "sitemaps": site_map.sitemap_urls, "urls": urls}
        click.echo(json.dumps(document, indent=2))
        return

    sitemap_files = site_map.sitemap_urls
    if not sitemap_files:
        click.echo(f"No sitemap found for {site_map.root}.")
        return

    click.echo(f"Found {len(sitemap_files)} sitemap file(s) for {site_map.root}:")
    for sitemap_file in sitemap_files:
        click.echo(f" - {sitemap_file}")

    qualifier = " matching filter" if pattern else ""
    click.echo(f"\n{len(urls)} URL(s){qualifier}:")

    # Human-readable output is capped at 100 URLs; the remainder is summarised.
    shown = urls[:100]
    for entry in shown:
        click.echo(f" {entry}")
    if len(urls) > 100:
        click.echo(f" ... and {len(urls) - 100} more")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
@cli.command()
@click.option("--host", default=None, help="Bind host (default from settings / 0.0.0.0)")
@click.option("--port", default=None, type=int, help="Bind port (default from settings / 8000)")
@click.option("--reload", is_flag=True, help="Auto-reload on code changes (dev only)")
def serve(host: str | None, port: int | None, reload: bool) -> None:
    """Run the BIE REST API server (FastAPI + Uvicorn)."""
    # uvicorn is an optional extra: fail with an install hint, not a traceback.
    try:
        import uvicorn
    except ImportError:
        click.echo("uvicorn is required: pip install 'bits-bie[server]'", err=True)
        sys.exit(1)

    settings = BIESettings()
    uvicorn.run(
        "bie.server:app",
        host=host or settings.host,
        # Explicit None check: ``--port 0`` (let the OS pick a free port) is
        # a valid request, and ``port or settings.port`` would silently
        # replace it with the configured default.
        port=settings.port if port is None else port,
        reload=reload,
    )
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@cli.command()
def mcp() -> None:
    """Run BIE as a Model Context Protocol (MCP) server over stdio.

    Add to your MCP client config (e.g. Claude Desktop) as a command:

    \b
    {
    "mcpServers": {
    "bie": {
    "command": "bie",
    "args": ["mcp"]
    }
    }
    }
    """
    # The MCP server module is imported lazily; an ImportError is reported
    # as a missing optional extra.
    try:
        from bie.mcp.server import run_mcp_server
    except ImportError:
        click.echo("MCP support requires: pip install 'bits-bie[mcp]'", err=True)
        sys.exit(1)
    else:
        run_mcp_server()
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def main() -> None:
    """Entry point: invoke the root ``cli`` command group."""
    cli()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
bie/config.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BIE configuration.
|
|
3
|
+
|
|
4
|
+
All settings can be overridden via environment variables prefixed with
|
|
5
|
+
``BIE_`` (e.g. ``BIE_MAX_PAGES=200``) or passed directly to
|
|
6
|
+
``BIESettings(...)``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BIESettings(BaseSettings):
    # Central configuration object for BIE. Per ``model_config`` at the bottom
    # of the class, values can come from ``BIE_``-prefixed environment
    # variables (matched case-insensitively) or a UTF-8 ``.env`` file, and
    # unrecognised keys are ignored rather than rejected.

    # --- Crawl behaviour (delegated to Bitscrape) -----------------------
    max_pages: int = Field(40, ge=1, description="Max pages to crawl per source URL")
    max_depth: int = Field(2, ge=0, description="Max link-follow depth")
    # Bounded to 1..256 by the Field constraints.
    concurrent_requests: int = Field(16, ge=1, le=256)
    # Non-negative; unit is presumably seconds -- TODO confirm against Bitscrape.
    download_delay: float = Field(0.0, ge=0.0)
    # NOTE(review): the "BIE/0.1" token does not track the package version.
    user_agent: str = "BIE/0.1 (+https://github.com/Sudharsansm/BIE) bitscrape"
    robotstxt_obey: bool = True
    # Minimum 1.0; unit is presumably seconds -- TODO confirm.
    request_timeout: float = Field(20.0, ge=1.0)
    use_playwright: bool = False

    # --- Indexing / retrieval --------------------------------------------
    chunk_size: int = Field(800, ge=100, description="Approx characters per chunk")
    # Only constrained to be >= 0; not validated against chunk_size.
    chunk_overlap: int = Field(100, ge=0)
    use_embeddings: bool = Field(
        True,
        description="Enable semantic (vector) search via sentence-transformers. "
        "Falls back to BM25-only if the model can't be loaded.",
    )
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    # Each weight is independently limited to [0, 1]; nothing here requires
    # the two to sum to 1.
    bm25_weight: float = Field(0.5, ge=0.0, le=1.0)
    vector_weight: float = Field(0.5, ge=0.0, le=1.0)

    # --- Storage -----------------------------------------------------------
    index_dir: str = Field(".bie_index", description="Directory for persisted index")
    persist: bool = Field(False, description="Persist index to disk between runs")

    # --- Server --------------------------------------------------------------
    # NOTE(review): the default host binds all interfaces while ``api_key``
    # defaults to None (no key configured) -- confirm this is the intended
    # out-of-the-box exposure.
    host: str = "0.0.0.0"
    port: int = 8000
    api_key: str | None = Field(
        default=None,
        description="If set, all /search and /crawl endpoints require "
        "an `Authorization: Bearer <key>` header.",
    )

    # Settings sources: BIE_* environment variables and a ".env" file.
    model_config = SettingsConfigDict(
        env_prefix="BIE_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )
|
bie/crawler.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BIE Crawler — thin orchestration layer over Bitscrape's Engine.
|
|
3
|
+
|
|
4
|
+
Implements PRD Module 1 (Crawler) for the OSS edition: runs the
|
|
5
|
+
:class:`bie.spiders.generic.BIESpider` against one or more seed URLs,
|
|
6
|
+
collects extracted pages in-memory as :class:`bie.models.Document`
|
|
7
|
+
objects, ready for chunking + indexing.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import logging
|
|
14
|
+
from typing import Any
|
|
15
|
+
from urllib.parse import urlparse
|
|
16
|
+
|
|
17
|
+
import bitscrape
|
|
18
|
+
from bitscrape.pipeline.pipelines import BasePipeline
|
|
19
|
+
|
|
20
|
+
from bie.config import BIESettings
|
|
21
|
+
from bie.models import Document
|
|
22
|
+
from bie.spiders.generic import BIESpider
|
|
23
|
+
|
|
24
|
+
# Module-level logger, registered under the fixed name "bie.crawler".
logger = logging.getLogger("bie.crawler")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class _CollectorPipeline(BasePipeline):
    """Collects every scraped item into an in-memory list."""

    def __init__(self) -> None:
        # NOTE(review): BasePipeline.__init__ is not invoked here -- confirm
        # the base class needs no initialisation of its own.
        # Items accumulate in arrival order and are never cleared.
        self.items: list[dict[str, Any]] = []

    async def process_item(self, item: Any, spider: Any) -> Any:
        """Record ``item`` and return it unchanged; ``spider`` is unused."""
        self.items.append(item)
        return item
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Crawler:
    """Crawls a list of seed URLs using Bitscrape and returns Documents."""

    def __init__(self, settings: BIESettings | None = None) -> None:
        """Store ``settings``, or a default ``BIESettings()`` when omitted."""
        self.settings = settings or BIESettings()

    def crawl(
        self, urls: list[str], allowed_domains: list[str] | None = None, instruction: str = ""
    ) -> list[Document]:
        """Synchronous convenience wrapper around :meth:`acrawl`.

        The coroutine is driven with :func:`asyncio.run`, which cannot be
        used from a thread that already has a running event loop; in async
        code, ``await`` :meth:`acrawl` directly instead.
        """
        return asyncio.run(self.acrawl(urls, allowed_domains, instruction))

    async def acrawl(
        self,
        urls: list[str],
        allowed_domains: list[str] | None = None,
        instruction: str = "",
    ) -> list[Document]:
        """Crawl ``urls`` and return one :class:`Document` per page with text.

        Args:
            urls: Seed URLs. An empty list returns ``[]`` without crawling.
            allowed_domains: Domains assigned to the spider. When ``None``,
                defaults to the sorted, de-duplicated ``netloc`` of each seed
                URL (seeds with no netloc contribute nothing).
            instruction: Free-text hint stored on the spider as
                ``spider.instruction``.

        Returns:
            Documents built from the collected items; items whose ``"text"``
            is missing or empty are skipped.
        """
        if not urls:
            return []

        if allowed_domains is None:
            # Parse each seed once; drop the empty netloc of relative or
            # malformed URLs.
            hosts = {urlparse(u).netloc for u in urls}
            hosts.discard("")
            allowed_domains = sorted(hosts)

        bs_settings = bitscrape.Settings(
            concurrent_requests=self.settings.concurrent_requests,
            download_delay=self.settings.download_delay,
            user_agent=self.settings.user_agent,
            robotstxt_obey=self.settings.robotstxt_obey,
            download_timeout=self.settings.request_timeout,
            max_depth=self.settings.max_depth,
        )

        spider = BIESpider(settings=bs_settings)
        spider.start_urls = list(urls)
        spider.allowed_domains = allowed_domains
        spider.max_pages = self.settings.max_pages
        spider.max_depth = self.settings.max_depth
        spider.instruction = instruction

        collector = _CollectorPipeline()

        middlewares = [
            bitscrape.UserAgentMiddleware(),
            bitscrape.CookieMiddleware(),
        ]
        if bs_settings.robotstxt_obey:
            # The robots middleware goes first in the list.
            middlewares.insert(0, bitscrape.RobotsMiddleware())

        engine = bitscrape.Engine(
            spider=spider,
            settings=bs_settings,
            pipelines=[collector],
            middlewares=middlewares,
        )

        stats = await engine.run()
        logger.info(
            "Crawled %d page(s) from %d seed URL(s) — %d failed",
            stats.items_scraped,
            len(urls),
            stats.requests_failed,
        )

        documents: list[Document] = []
        for item in collector.items:
            if not item.get("text"):
                continue
            url = item["url"]
            documents.append(
                Document(
                    url=url,
                    # ``or`` (not a .get default) so a present-but-empty or
                    # None title also falls back to the URL.
                    title=item.get("title") or url,
                    text=item["text"],
                    site=urlparse(url).netloc,
                    metadata={"depth": item.get("depth", 0)},
                )
            )
        return documents
|