bits-bie 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. bits_bie-0.2.0/.github/workflows/ci.yml +27 -0
  2. bits_bie-0.2.0/.github/workflows/publish.yml +24 -0
  3. bits_bie-0.2.0/.gitignore +14 -0
  4. bits_bie-0.2.0/LICENSE +21 -0
  5. bits_bie-0.2.0/PKG-INFO +281 -0
  6. bits_bie-0.2.0/README.md +236 -0
  7. bits_bie-0.2.0/bie/__init__.py +60 -0
  8. bits_bie-0.2.0/bie/agents/__init__.py +315 -0
  9. bits_bie-0.2.0/bie/api/__init__.py +457 -0
  10. bits_bie-0.2.0/bie/auth/__init__.py +255 -0
  11. bits_bie-0.2.0/bie/chunker.py +83 -0
  12. bits_bie-0.2.0/bie/cli.py +136 -0
  13. bits_bie-0.2.0/bie/client.py +214 -0
  14. bits_bie-0.2.0/bie/compliance/__init__.py +472 -0
  15. bits_bie-0.2.0/bie/config.py +57 -0
  16. bits_bie-0.2.0/bie/context/__init__.py +87 -0
  17. bits_bie-0.2.0/bie/contradiction/__init__.py +204 -0
  18. bits_bie-0.2.0/bie/crawler/__init__.py +325 -0
  19. bits_bie-0.2.0/bie/crawler.py +109 -0
  20. bits_bie-0.2.0/bie/engine.py +132 -0
  21. bits_bie-0.2.0/bie/gateway/__init__.py +132 -0
  22. bits_bie-0.2.0/bie/index.py +225 -0
  23. bits_bie-0.2.0/bie/indexer/__init__.py +376 -0
  24. bits_bie-0.2.0/bie/kg/__init__.py +394 -0
  25. bits_bie-0.2.0/bie/mcp/__init__.py +3 -0
  26. bits_bie-0.2.0/bie/mcp/server.py +101 -0
  27. bits_bie-0.2.0/bie/models.py +76 -0
  28. bits_bie-0.2.0/bie/quicksearch.py +37 -0
  29. bits_bie-0.2.0/bie/regions/__init__.py +236 -0
  30. bits_bie-0.2.0/bie/retriever/__init__.py +2 -0
  31. bits_bie-0.2.0/bie/server.py +138 -0
  32. bits_bie-0.2.0/bie/spiders/__init__.py +3 -0
  33. bits_bie-0.2.0/bie/spiders/generic.py +117 -0
  34. bits_bie-0.2.0/bie/trust/__init__.py +99 -0
  35. bits_bie-0.2.0/bie/verifier/__init__.py +216 -0
  36. bits_bie-0.2.0/docs/API.md +103 -0
  37. bits_bie-0.2.0/examples/basic_search.py +30 -0
  38. bits_bie-0.2.0/examples/quickstart.py +131 -0
  39. bits_bie-0.2.0/examples/reusable_index.py +26 -0
  40. bits_bie-0.2.0/pyproject.toml +85 -0
  41. bits_bie-0.2.0/tests/__init__.py +0 -0
  42. bits_bie-0.2.0/tests/test_bie.py +371 -0
  43. bits_bie-0.2.0/tests/test_bie_v1.py +708 -0
  44. bits_bie-0.2.0/tests/test_chunker.py +27 -0
  45. bits_bie-0.2.0/tests/test_engine.py +30 -0
  46. bits_bie-0.2.0/tests/test_index.py +49 -0
@@ -0,0 +1,27 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: ${{ matrix.python-version }}
20
+ - name: Install dependencies
21
+ run: |
22
+ python -m pip install --upgrade pip
23
+ pip install -e ".[dev]"
24
+ - name: Run tests
25
+ run: pytest -v
26
+ - name: Lint
27
+ run: ruff check bie tests
@@ -0,0 +1,24 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ build-and-publish:
10
+ runs-on: ubuntu-latest
11
+ environment: pypi
12
+ permissions:
13
+ id-token: write
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.12"
19
+ - name: Install build tools
20
+ run: python -m pip install --upgrade pip build
21
+ - name: Build package
22
+ run: python -m build
23
+ - name: Publish to PyPI
24
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,14 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .eggs/
5
+ build/
6
+ dist/
7
+ .pytest_cache/
8
+ .ruff_cache/
9
+ .venv/
10
+ venv/
11
+ .env
12
+ .bie_index/
13
+ *.jsonl
14
+ .DS_Store
bits_bie-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sudharsan SM
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,281 @@
1
+ Metadata-Version: 2.4
2
+ Name: bits-bie
3
+ Version: 0.2.0
4
+ Summary: BitSearch Intelligence Engine — real-time, citation-backed web search & extraction for AI apps. Built on Bitscrape.
5
+ Project-URL: Homepage, https://github.com/Sudharsansm/BIE
6
+ Project-URL: Repository, https://github.com/Sudharsansm/BIE
7
+ Project-URL: Issues, https://github.com/Sudharsansm/BIE/issues
8
+ Project-URL: Bitscrape (core crawler), https://github.com/Sudharsansm/Bitscrape
9
+ Author: Sudharsan SM
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: ai-agents,bitscrape,information-retrieval,llm,mcp,model-context-protocol,rag,scraping,search,web-crawler
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Classifier: Topic :: Text Processing :: Indexing
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: bitscrape>=0.1.6
25
+ Requires-Dist: click>=8.0
26
+ Requires-Dist: pydantic-settings>=2.0
27
+ Requires-Dist: pydantic>=2.0
28
+ Provides-Extra: all
29
+ Requires-Dist: fastapi>=0.110; extra == 'all'
30
+ Requires-Dist: mcp>=1.0; extra == 'all'
31
+ Requires-Dist: sentence-transformers>=2.2; extra == 'all'
32
+ Requires-Dist: uvicorn[standard]>=0.27; extra == 'all'
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
35
+ Requires-Dist: pytest>=8.0; extra == 'dev'
36
+ Requires-Dist: ruff>=0.4; extra == 'dev'
37
+ Provides-Extra: embeddings
38
+ Requires-Dist: sentence-transformers>=2.2; extra == 'embeddings'
39
+ Provides-Extra: mcp
40
+ Requires-Dist: mcp>=1.0; extra == 'mcp'
41
+ Provides-Extra: server
42
+ Requires-Dist: fastapi>=0.110; extra == 'server'
43
+ Requires-Dist: uvicorn[standard]>=0.27; extra == 'server'
44
+ Description-Content-Type: text/markdown
45
+
46
+ # BIE — BitSearch Intelligence Engine
47
+
48
+ [![PyPI](https://img.shields.io/pypi/v/bits-bie.svg)](https://pypi.org/project/bits-bie/)
49
+ [![Python](https://img.shields.io/pypi/pyversions/bits-bie.svg)](https://pypi.org/project/bits-bie/)
50
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
51
+ [![Built on Bitscrape](https://img.shields.io/badge/built%20on-Bitscrape-orange.svg)](https://github.com/Sudharsansm/Bitscrape)
52
+
53
+ **The fastest, simplest way to give any LLM, RAG pipeline, or AI agent
54
+ real-time, citation-backed web search and extraction.**
55
+
56
+ BIE crawls the live web (powered by [**Bitscrape**](https://pypi.org/project/bitscrape/),
57
+ our high-performance async crawler), builds a hybrid **BM25 + semantic
58
+ vector** index in memory, and returns ranked, source-attributed results —
59
+ all from a single Python call, REST endpoint, CLI command, or
60
+ [MCP](https://modelcontextprotocol.io) tool.
61
+
62
+ ```python
63
+ import bie
64
+
65
+ results = bie.search(
66
+ "latest semiconductor export rules 2026",
67
+ urls=["https://www.reuters.com/technology/"],
68
+ )
69
+
70
+ for r in results:
71
+ print(r.title, "—", r.url, f"(score={r.score:.3f})")
72
+ ```
73
+
74
+ ---
75
+
76
+ ## Why BIE?
77
+
78
+ - 🚀 **Zero infra** — no Elasticsearch, no Milvus, no Kafka. Pure Python,
79
+ in-memory hybrid index. Scale up later if you need to.
80
+ - 🧠 **Hybrid retrieval out of the box** — BM25 lexical search fused with
81
+ sentence-transformer embeddings via Reciprocal Rank Fusion.
82
+ - 🤖 **MCP-ready** — drop-in tool for Claude Desktop, Claude Code, and any
83
+ MCP-compatible AI app.
84
+ - ⚡ **Powered by Bitscrape** — async, polite (robots.txt-aware), and fast
85
+ crawling/extraction under the hood.
86
+ - 🔌 **Use anywhere** — Python library, REST API, CLI, or MCP server.
87
+
88
+ ---
89
+
90
+ ## Install
91
+
92
+ ```bash
93
+ pip install bits-bie
94
+ ```
95
+
96
+ > Note: the PyPI **distribution** is named `bits-bie` (since `bie`
97
+ > was too similar to an existing PyPI project), but you still
98
+ > `import bie` and run the `bie` CLI command — same API as shown below.
99
+
100
+ Optional extras:
101
+
102
+ ```bash
103
+ pip install "bits-bie[embeddings]" # semantic/vector search (sentence-transformers)
104
+ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
105
+ pip install "bits-bie[mcp]" # Model Context Protocol server
106
+ pip install "bits-bie[all]" # everything
107
+ ```
108
+
109
+ > BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
110
+ > proprietary async crawling & extraction framework, which is installed
111
+ > automatically.
112
+
113
+ ---
114
+
115
+ ## Usage
116
+
117
+ ### 1. One-shot search (Python)
118
+
119
+ ```python
120
+ import bie
121
+
122
+ results = bie.search("AI regulation news", urls=["https://example.com/news"], top_k=5)
123
+ for r in results:
124
+ print(r)
125
+ ```
126
+
127
+ ### 2. Build a reusable index
128
+
129
+ ```python
130
+ from bie import BIE
131
+
132
+ engine = BIE()
133
+ engine.crawl(["https://example.com/blog", "https://another-site.com"])
134
+
135
+ print(engine.search("quarterly earnings"))
136
+ print(engine.search("product launch")) # reuses the same index
137
+ ```
138
+
139
+ ### 3. Index your own text (no crawling)
140
+
141
+ ```python
142
+ engine.add_text(
143
+ url="internal://doc-1",
144
+ title="Q2 Strategy Memo",
145
+ text="...",
146
+ trust_score=1.0,
147
+ )
148
+ ```
149
+
150
+ ### 4. CLI
151
+
152
+ ```bash
153
+ # Crawl + search in one command
154
+ bie search "global markets today" --url https://www.bbc.com/news --top-k 5
155
+
156
+ # Just crawl & dump extracted pages
157
+ bie crawl https://example.com --max-pages 20 --out docs.jsonl
158
+
159
+ # Run the REST API
160
+ bie serve --port 8000
161
+
162
+ # Run as an MCP server (stdio)
163
+ bie mcp
164
+ ```
165
+
166
+ ### 5. REST API
167
+
168
+ ```bash
169
+ bie serve --port 8000
170
+ ```
171
+
172
+ ```bash
173
+ curl -X POST http://localhost:8000/crawl/url \
174
+ -H "Content-Type: application/json" \
175
+ -d '{"urls": ["https://example.com/news"]}'
176
+
177
+ curl -X POST http://localhost:8000/search \
178
+ -H "Content-Type: application/json" \
179
+ -d '{"query": "latest news", "top_k": 5}'
180
+ ```
181
+
182
+ See the full endpoint contract in [`docs/API.md`](docs/API.md).
183
+
184
+ ### 6. MCP (Model Context Protocol)
185
+
186
+ Add BIE as a tool in your MCP client (e.g. `claude_desktop_config.json`):
187
+
188
+ ```json
189
+ {
190
+ "mcpServers": {
191
+ "bie": {
192
+ "command": "bie",
193
+ "args": ["mcp"]
194
+ }
195
+ }
196
+ }
197
+ ```
198
+
199
+ This exposes three tools to your AI assistant:
200
+
201
+ - `bie_search(query, urls, top_k, max_pages)` — crawl + search in one call
202
+ - `bie_crawl(urls, max_pages)` — crawl & index into a session-persistent store
203
+ - `bie_index_search(query, top_k)` — search the session index
204
+
205
+ ---
206
+
207
+ ## Configuration
208
+
209
+ All settings can be set via environment variables prefixed with `BIE_`,
210
+ or passed directly:
211
+
212
+ ```python
213
+ from bie import BIE, BIESettings
214
+
215
+ engine = BIE(BIESettings(
216
+ max_pages=20,
217
+ max_depth=1,
218
+ use_embeddings=True,
219
+ embedding_model="sentence-transformers/all-MiniLM-L6-v2",
220
+ bm25_weight=0.6,
221
+ vector_weight=0.4,
222
+ ))
223
+ ```
224
+
225
+ | Setting | Env var | Default | Description |
226
+ |---|---|---|---|
227
+ | `max_pages` | `BIE_MAX_PAGES` | `40` | Max pages crawled per seed URL |
228
+ | `max_depth` | `BIE_MAX_DEPTH` | `2` | Max link-follow depth |
229
+ | `concurrent_requests` | `BIE_CONCURRENT_REQUESTS` | `16` | Crawl concurrency |
230
+ | `robotstxt_obey` | `BIE_ROBOTSTXT_OBEY` | `true` | Respect robots.txt |
231
+ | `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
232
+ | `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
233
+ | `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
234
+ | `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
235
+
236
+ ---
237
+
238
+ ## Architecture
239
+
240
+ ```
241
+ ┌─────────────────────────────────────────┐
242
+ │ bie │
243
+ │ │
244
+ urls ──▶ │ Crawler (Bitscrape) │
245
+ │ │ │
246
+ │ ▼ │
247
+ │ Document → Chunker → HybridIndex │
248
+ │ │ │ │
249
+ │ BM25Index VectorIndex │
250
+ │ │ │ │
251
+ │ Fusion (RRF) │
252
+ │ │ │
253
+ query ──▶ │ ▼ │
254
+ │ Ranked SearchResults │
255
+ └─────────────────────────────────────────┘
256
+ │ │ │
257
+ Python API REST API MCP Server
258
+ ```
259
+
260
+ This OSS edition implements the core of the BIE PRD's **Module 1
261
+ (Crawler)**, **Module 2 (Indexes)**, **Module 3 (Hybrid Retriever)**, and
262
+ **Module 11 (Agent API)** as a single lightweight package — no external
263
+ services required. Larger deployments can swap `BM25Index`/`VectorIndex`
264
+ for Elasticsearch/Milvus-backed implementations behind the same
265
+ `HybridIndex` interface.
266
+
267
+ ---
268
+
269
+ ## Built on Bitscrape
270
+
271
+ BIE's crawling and extraction layer is powered by
272
+ [**Bitscrape**](https://github.com/Sudharsansm/Bitscrape)
273
+ (`pip install bitscrape`), our async, robots.txt-aware web scraping
274
+ framework — giving BIE high-performance, polite, production-grade crawling
275
+ out of the box.
276
+
277
+ ---
278
+
279
+ ## License
280
+
281
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,236 @@
1
+ # BIE — BitSearch Intelligence Engine
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/bits-bie.svg)](https://pypi.org/project/bits-bie/)
4
+ [![Python](https://img.shields.io/pypi/pyversions/bits-bie.svg)](https://pypi.org/project/bits-bie/)
5
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
6
+ [![Built on Bitscrape](https://img.shields.io/badge/built%20on-Bitscrape-orange.svg)](https://github.com/Sudharsansm/Bitscrape)
7
+
8
+ **The fastest, simplest way to give any LLM, RAG pipeline, or AI agent
9
+ real-time, citation-backed web search and extraction.**
10
+
11
+ BIE crawls the live web (powered by [**Bitscrape**](https://pypi.org/project/bitscrape/),
12
+ our high-performance async crawler), builds a hybrid **BM25 + semantic
13
+ vector** index in memory, and returns ranked, source-attributed results —
14
+ all from a single Python call, REST endpoint, CLI command, or
15
+ [MCP](https://modelcontextprotocol.io) tool.
16
+
17
+ ```python
18
+ import bie
19
+
20
+ results = bie.search(
21
+ "latest semiconductor export rules 2026",
22
+ urls=["https://www.reuters.com/technology/"],
23
+ )
24
+
25
+ for r in results:
26
+ print(r.title, "—", r.url, f"(score={r.score:.3f})")
27
+ ```
28
+
29
+ ---
30
+
31
+ ## Why BIE?
32
+
33
+ - 🚀 **Zero infra** — no Elasticsearch, no Milvus, no Kafka. Pure Python,
34
+ in-memory hybrid index. Scale up later if you need to.
35
+ - 🧠 **Hybrid retrieval out of the box** — BM25 lexical search fused with
36
+ sentence-transformer embeddings via Reciprocal Rank Fusion.
37
+ - 🤖 **MCP-ready** — drop-in tool for Claude Desktop, Claude Code, and any
38
+ MCP-compatible AI app.
39
+ - ⚡ **Powered by Bitscrape** — async, polite (robots.txt-aware), and fast
40
+ crawling/extraction under the hood.
41
+ - 🔌 **Use anywhere** — Python library, REST API, CLI, or MCP server.
42
+
43
+ ---
44
+
45
+ ## Install
46
+
47
+ ```bash
48
+ pip install bits-bie
49
+ ```
50
+
51
+ > Note: the PyPI **distribution** is named `bits-bie` (since `bie`
52
+ > was too similar to an existing PyPI project), but you still
53
+ > `import bie` and run the `bie` CLI command — same API as shown below.
54
+
55
+ Optional extras:
56
+
57
+ ```bash
58
+ pip install "bits-bie[embeddings]" # semantic/vector search (sentence-transformers)
59
+ pip install "bits-bie[server]" # FastAPI + Uvicorn REST server
60
+ pip install "bits-bie[mcp]" # Model Context Protocol server
61
+ pip install "bits-bie[all]" # everything
62
+ ```
63
+
64
+ > BIE depends on [`bitscrape`](https://pypi.org/project/bitscrape/), our
65
+ > proprietary async crawling & extraction framework, which is installed
66
+ > automatically.
67
+
68
+ ---
69
+
70
+ ## Usage
71
+
72
+ ### 1. One-shot search (Python)
73
+
74
+ ```python
75
+ import bie
76
+
77
+ results = bie.search("AI regulation news", urls=["https://example.com/news"], top_k=5)
78
+ for r in results:
79
+ print(r)
80
+ ```
81
+
82
+ ### 2. Build a reusable index
83
+
84
+ ```python
85
+ from bie import BIE
86
+
87
+ engine = BIE()
88
+ engine.crawl(["https://example.com/blog", "https://another-site.com"])
89
+
90
+ print(engine.search("quarterly earnings"))
91
+ print(engine.search("product launch")) # reuses the same index
92
+ ```
93
+
94
+ ### 3. Index your own text (no crawling)
95
+
96
+ ```python
97
+ engine.add_text(
98
+ url="internal://doc-1",
99
+ title="Q2 Strategy Memo",
100
+ text="...",
101
+ trust_score=1.0,
102
+ )
103
+ ```
104
+
105
+ ### 4. CLI
106
+
107
+ ```bash
108
+ # Crawl + search in one command
109
+ bie search "global markets today" --url https://www.bbc.com/news --top-k 5
110
+
111
+ # Just crawl & dump extracted pages
112
+ bie crawl https://example.com --max-pages 20 --out docs.jsonl
113
+
114
+ # Run the REST API
115
+ bie serve --port 8000
116
+
117
+ # Run as an MCP server (stdio)
118
+ bie mcp
119
+ ```
120
+
121
+ ### 5. REST API
122
+
123
+ ```bash
124
+ bie serve --port 8000
125
+ ```
126
+
127
+ ```bash
128
+ curl -X POST http://localhost:8000/crawl/url \
129
+ -H "Content-Type: application/json" \
130
+ -d '{"urls": ["https://example.com/news"]}'
131
+
132
+ curl -X POST http://localhost:8000/search \
133
+ -H "Content-Type: application/json" \
134
+ -d '{"query": "latest news", "top_k": 5}'
135
+ ```
136
+
137
+ See the full endpoint contract in [`docs/API.md`](docs/API.md).
138
+
139
+ ### 6. MCP (Model Context Protocol)
140
+
141
+ Add BIE as a tool in your MCP client (e.g. `claude_desktop_config.json`):
142
+
143
+ ```json
144
+ {
145
+ "mcpServers": {
146
+ "bie": {
147
+ "command": "bie",
148
+ "args": ["mcp"]
149
+ }
150
+ }
151
+ }
152
+ ```
153
+
154
+ This exposes three tools to your AI assistant:
155
+
156
+ - `bie_search(query, urls, top_k, max_pages)` — crawl + search in one call
157
+ - `bie_crawl(urls, max_pages)` — crawl & index into a session-persistent store
158
+ - `bie_index_search(query, top_k)` — search the session index
159
+
160
+ ---
161
+
162
+ ## Configuration
163
+
164
+ All settings can be set via environment variables prefixed with `BIE_`,
165
+ or passed directly:
166
+
167
+ ```python
168
+ from bie import BIE, BIESettings
169
+
170
+ engine = BIE(BIESettings(
171
+ max_pages=20,
172
+ max_depth=1,
173
+ use_embeddings=True,
174
+ embedding_model="sentence-transformers/all-MiniLM-L6-v2",
175
+ bm25_weight=0.6,
176
+ vector_weight=0.4,
177
+ ))
178
+ ```
179
+
180
+ | Setting | Env var | Default | Description |
181
+ |---|---|---|---|
182
+ | `max_pages` | `BIE_MAX_PAGES` | `40` | Max pages crawled per seed URL |
183
+ | `max_depth` | `BIE_MAX_DEPTH` | `2` | Max link-follow depth |
184
+ | `concurrent_requests` | `BIE_CONCURRENT_REQUESTS` | `16` | Crawl concurrency |
185
+ | `robotstxt_obey` | `BIE_ROBOTSTXT_OBEY` | `true` | Respect robots.txt |
186
+ | `use_embeddings` | `BIE_USE_EMBEDDINGS` | `true` | Enable semantic search |
187
+ | `chunk_size` | `BIE_CHUNK_SIZE` | `800` | Chars per chunk |
188
+ | `bm25_weight` / `vector_weight` | `BIE_BM25_WEIGHT` / `BIE_VECTOR_WEIGHT` | `0.5` / `0.5` | Fusion weights |
189
+ | `api_key` | `BIE_API_KEY` | `None` | If set, requires `Authorization: Bearer <key>` |
190
+
191
+ ---
192
+
193
+ ## Architecture
194
+
195
+ ```
196
+ ┌─────────────────────────────────────────┐
197
+ │ bie │
198
+ │ │
199
+ urls ──▶ │ Crawler (Bitscrape) │
200
+ │ │ │
201
+ │ ▼ │
202
+ │ Document → Chunker → HybridIndex │
203
+ │ │ │ │
204
+ │ BM25Index VectorIndex │
205
+ │ │ │ │
206
+ │ Fusion (RRF) │
207
+ │ │ │
208
+ query ──▶ │ ▼ │
209
+ │ Ranked SearchResults │
210
+ └─────────────────────────────────────────┘
211
+ │ │ │
212
+ Python API REST API MCP Server
213
+ ```
214
+
215
+ This OSS edition implements the core of the BIE PRD's **Module 1
216
+ (Crawler)**, **Module 2 (Indexes)**, **Module 3 (Hybrid Retriever)**, and
217
+ **Module 11 (Agent API)** as a single lightweight package — no external
218
+ services required. Larger deployments can swap `BM25Index`/`VectorIndex`
219
+ for Elasticsearch/Milvus-backed implementations behind the same
220
+ `HybridIndex` interface.
221
+
222
+ ---
223
+
224
+ ## Built on Bitscrape
225
+
226
+ BIE's crawling and extraction layer is powered by
227
+ [**Bitscrape**](https://github.com/Sudharsansm/Bitscrape)
228
+ (`pip install bitscrape`), our async, robots.txt-aware web scraping
229
+ framework — giving BIE high-performance, polite, production-grade crawling
230
+ out of the box.
231
+
232
+ ---
233
+
234
+ ## License
235
+
236
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,60 @@
1
+ """
2
+ BIE — BitSearch Intelligence Engine
3
+ =====================================
4
+
5
+ The fastest, simplest way to give any LLM, RAG pipeline, or AI agent
6
+ real-time, citation-backed web search and extraction.
7
+
8
+ Built on top of **Bitscrape** (https://pypi.org/project/bitscrape/) —
9
+ BIE adds a hybrid (keyword + semantic) search index, a clean Python API,
10
+ a REST server, a CLI, and a Model Context Protocol (MCP) tool so any
11
+ AI application can call ``search()`` and get fresh, ranked, cited results.
12
+
13
+ Quick start
14
+ -----------
15
+
16
+ .. code-block:: python
17
+
18
+ import bie
19
+
20
+ # One-shot: crawl + index + search, all in memory
21
+ results = bie.search("latest semiconductor export rules 2026", urls=[
22
+ "https://www.reuters.com/technology/",
23
+ "https://www.bloomberg.com/technology",
24
+ ])
25
+
26
+ for r in results:
27
+ print(r.title, r.url, r.score)
28
+
29
+ Or build a reusable index you can query repeatedly::
30
+
31
+ engine = bie.BIE()
32
+ engine.crawl(["https://example.com"])
33
+ hits = engine.search("example query", top_k=5)
34
+
35
+ Run as a server::
36
+
37
+ bie serve --port 8000
38
+
39
+ Run as an MCP tool (for Claude Desktop, Claude Code, etc.)::
40
+
41
+ bie mcp
42
+ """
43
+
44
+ from __future__ import annotations
45
+
46
+ from bie.config import BIESettings
47
+ from bie.engine import BIE
48
+ from bie.models import Document, SearchResult
49
+ from bie.quicksearch import search
50
+
51
+ __version__ = "0.2.0"
52
+
53
+ __all__ = [
54
+ "BIE",
55
+ "BIESettings",
56
+ "Document",
57
+ "SearchResult",
58
+ "search",
59
+ "__version__",
60
+ ]