bridgekit 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bridgekit-0.1.1/bridgekit.egg-info → bridgekit-0.2.0}/PKG-INFO +49 -2
- bridgekit-0.1.1/PKG-INFO → bridgekit-0.2.0/README.md +42 -20
- bridgekit-0.2.0/bridgekit/__init__.py +5 -0
- bridgekit-0.2.0/bridgekit/search.py +122 -0
- bridgekit-0.1.1/README.md → bridgekit-0.2.0/bridgekit.egg-info/PKG-INFO +67 -1
- {bridgekit-0.1.1 → bridgekit-0.2.0}/bridgekit.egg-info/SOURCES.txt +1 -0
- bridgekit-0.2.0/bridgekit.egg-info/requires.txt +7 -0
- {bridgekit-0.1.1 → bridgekit-0.2.0}/pyproject.toml +10 -1
- bridgekit-0.1.1/bridgekit/__init__.py +0 -4
- bridgekit-0.1.1/bridgekit.egg-info/requires.txt +0 -1
- {bridgekit-0.1.1 → bridgekit-0.2.0}/LICENSE +0 -0
- {bridgekit-0.1.1 → bridgekit-0.2.0}/bridgekit/reviewer.py +0 -0
- {bridgekit-0.1.1 → bridgekit-0.2.0}/bridgekit.egg-info/dependency_links.txt +0 -0
- {bridgekit-0.1.1 → bridgekit-0.2.0}/bridgekit.egg-info/top_level.txt +0 -0
- {bridgekit-0.1.1 → bridgekit-0.2.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bridgekit
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: AI tools that make you a better data scientist, not a redundant one.
|
|
5
5
|
License: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/getbridgekit/bridgekit
|
|
@@ -15,6 +15,12 @@ Requires-Python: >=3.9
|
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
17
|
Requires-Dist: anthropic>=0.20.0
|
|
18
|
+
Requires-Dist: chromadb>=0.4.0
|
|
19
|
+
Requires-Dist: sentence-transformers>=2.0.0
|
|
20
|
+
Requires-Dist: pypdf>=3.0.0
|
|
21
|
+
Requires-Dist: python-docx>=1.0.0
|
|
22
|
+
Requires-Dist: python-pptx>=0.6.0
|
|
23
|
+
Requires-Dist: nbformat>=5.0.0
|
|
18
24
|
Dynamic: license-file
|
|
19
25
|
|
|
20
26
|
# Bridgekit
|
|
@@ -134,6 +140,47 @@ Paste your writeup as a string and call `evaluate()` — that's it.
|
|
|
134
140
|
|
|
135
141
|
---
|
|
136
142
|
|
|
143
|
+
## Tool #2: Analysis Search
|
|
144
|
+
|
|
145
|
+
Ask questions across a collection of your past analysis documents. Point it at a folder and get answers grounded in your actual work — no digging through files manually.
|
|
146
|
+
|
|
147
|
+
Uses a vector database and semantic similarity to find relevant context across your documents — not keyword matching.
|
|
148
|
+
|
|
149
|
+
Supports `.txt`, `.md`, `.pdf`, `.docx`, `.pptx`, and `.ipynb` files.
|
|
150
|
+
|
|
151
|
+
**From a folder:**
|
|
152
|
+
```python
|
|
153
|
+
from bridgekit import ask
|
|
154
|
+
|
|
155
|
+
print(ask("what drove churn in Q3?", source="reports/"))
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
**From raw text:**
|
|
159
|
+
```python
|
|
160
|
+
from bridgekit import ask
|
|
161
|
+
|
|
162
|
+
text = """
|
|
163
|
+
Q3 churn rose to 4.5%, driven by a product outage in August and a pricing
|
|
164
|
+
change in July that increased SMB costs by 12%.
|
|
165
|
+
"""
|
|
166
|
+
|
|
167
|
+
print(ask("what caused the Q3 churn spike?", text=text))
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
**Output** *(based on sample data included in the repo)*:
|
|
171
|
+
```
|
|
172
|
+
Based on the Q3 2024 Churn Analysis, two primary factors drove the elevated
|
|
173
|
+
churn rate of 4.5%:
|
|
174
|
+
|
|
175
|
+
1. August Product Outage — A 14-hour outage affected 3,800 accounts. Impacted
|
|
176
|
+
accounts churned at 8.1% vs 3.2% for unaffected accounts.
|
|
177
|
+
|
|
178
|
+
2. July Pricing Change — SMB costs increased by an average of 12%, causing SMB
|
|
179
|
+
churn to spike to 7.2% — the highest single-month figure in the dataset.
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
137
184
|
## Why not just use Claude?
|
|
138
185
|
|
|
139
186
|
You could. But you'd need to know what to ask, how to frame it, and what a good answer looks like. Bridgekit has that baked in — it knows you're a data scientist presenting findings, so it asks the right questions automatically. No prompt engineering required. Just paste your work and run it.
|
|
@@ -156,7 +203,7 @@ Bridgekit only ever sees text you write yourself — your narrative, your conclu
|
|
|
156
203
|
|
|
157
204
|
## What's next?
|
|
158
205
|
|
|
159
|
-
Bridgekit is a suite, not a one-off.
|
|
206
|
+
Bridgekit is a suite, not a one-off. Two tools are live — more are coming:
|
|
160
207
|
|
|
161
208
|
- **Statistical approach suggester** — describe your problem in plain English, get the right test and why
|
|
162
209
|
- **Stakeholder translator** — turn your technical findings into a narrative a non-technical audience will actually follow
|
|
@@ -1,22 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: bridgekit
|
|
3
|
-
Version: 0.1.1
|
|
4
|
-
Summary: AI tools that make you a better data scientist, not a redundant one.
|
|
5
|
-
License: MIT
|
|
6
|
-
Project-URL: Homepage, https://github.com/getbridgekit/bridgekit
|
|
7
|
-
Project-URL: Issues, https://github.com/getbridgekit/bridgekit/issues
|
|
8
|
-
Keywords: data science,AI,analysis,evaluation,anthropic
|
|
9
|
-
Classifier: Development Status :: 3 - Alpha
|
|
10
|
-
Classifier: Intended Audience :: Science/Research
|
|
11
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
-
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
-
Requires-Python: >=3.9
|
|
15
|
-
Description-Content-Type: text/markdown
|
|
16
|
-
License-File: LICENSE
|
|
17
|
-
Requires-Dist: anthropic>=0.20.0
|
|
18
|
-
Dynamic: license-file
|
|
19
|
-
|
|
20
1
|
# Bridgekit
|
|
21
2
|
|
|
22
3
|
**AI tools that make you a better data scientist, not a redundant one.**
|
|
@@ -134,6 +115,47 @@ Paste your writeup as a string and call `evaluate()` — that's it.
|
|
|
134
115
|
|
|
135
116
|
---
|
|
136
117
|
|
|
118
|
+
## Tool #2: Analysis Search
|
|
119
|
+
|
|
120
|
+
Ask questions across a collection of your past analysis documents. Point it at a folder and get answers grounded in your actual work — no digging through files manually.
|
|
121
|
+
|
|
122
|
+
Uses a vector database and semantic similarity to find relevant context across your documents — not keyword matching.
|
|
123
|
+
|
|
124
|
+
Supports `.txt`, `.md`, `.pdf`, `.docx`, `.pptx`, and `.ipynb` files.
|
|
125
|
+
|
|
126
|
+
**From a folder:**
|
|
127
|
+
```python
|
|
128
|
+
from bridgekit import ask
|
|
129
|
+
|
|
130
|
+
print(ask("what drove churn in Q3?", source="reports/"))
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**From raw text:**
|
|
134
|
+
```python
|
|
135
|
+
from bridgekit import ask
|
|
136
|
+
|
|
137
|
+
text = """
|
|
138
|
+
Q3 churn rose to 4.5%, driven by a product outage in August and a pricing
|
|
139
|
+
change in July that increased SMB costs by 12%.
|
|
140
|
+
"""
|
|
141
|
+
|
|
142
|
+
print(ask("what caused the Q3 churn spike?", text=text))
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**Output** *(based on sample data included in the repo)*:
|
|
146
|
+
```
|
|
147
|
+
Based on the Q3 2024 Churn Analysis, two primary factors drove the elevated
|
|
148
|
+
churn rate of 4.5%:
|
|
149
|
+
|
|
150
|
+
1. August Product Outage — A 14-hour outage affected 3,800 accounts. Impacted
|
|
151
|
+
accounts churned at 8.1% vs 3.2% for unaffected accounts.
|
|
152
|
+
|
|
153
|
+
2. July Pricing Change — SMB costs increased by an average of 12%, causing SMB
|
|
154
|
+
churn to spike to 7.2% — the highest single-month figure in the dataset.
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
137
159
|
## Why not just use Claude?
|
|
138
160
|
|
|
139
161
|
You could. But you'd need to know what to ask, how to frame it, and what a good answer looks like. Bridgekit has that baked in — it knows you're a data scientist presenting findings, so it asks the right questions automatically. No prompt engineering required. Just paste your work and run it.
|
|
@@ -156,7 +178,7 @@ Bridgekit only ever sees text you write yourself — your narrative, your conclu
|
|
|
156
178
|
|
|
157
179
|
## What's next?
|
|
158
180
|
|
|
159
|
-
Bridgekit is a suite, not a one-off.
|
|
181
|
+
Bridgekit is a suite, not a one-off. Two tools are live — more are coming:
|
|
160
182
|
|
|
161
183
|
- **Statistical approach suggester** — describe your problem in plain English, get the right test and why
|
|
162
184
|
- **Stakeholder translator** — turn your technical findings into a narrative a non-technical audience will actually follow
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import anthropic
|
|
4
|
+
import chromadb
|
|
5
|
+
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
|
|
6
|
+
|
|
7
|
+
CHUNK_SIZE = 150 # words per chunk
|
|
8
|
+
CHUNK_OVERLAP = 20  # words shared between consecutive chunks
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _load_file(path: Path) -> str:
    """Extract the plain text of one document, choosing a reader by suffix.

    The suffix is compared case-insensitively. Format-specific parsers are
    imported inside their branch so they are only loaded when a file of that
    type is actually read.

    Args:
        path: Location of the file to read.

    Returns:
        The extracted text. ``.pdf`` pages, ``.docx`` paragraphs and
        ``.pptx`` shape texts are joined with single newlines; ``.ipynb``
        cell sources are joined with blank lines. Any other suffix is read
        as UTF-8 text.
    """
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        import pypdf

        reader = pypdf.PdfReader(str(path))
        # A falsy extract_text() result is treated as an empty page.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    if suffix == ".docx":
        import docx

        doc = docx.Document(str(path))
        # Skip paragraphs that contain only whitespace.
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())

    if suffix == ".pptx":
        from pptx import Presentation

        prs = Presentation(str(path))
        lines = []
        for slide in prs.slides:
            for shape in slide.shapes:
                # Only shapes exposing a non-blank ``text`` attribute count.
                if hasattr(shape, "text") and shape.text.strip():
                    lines.append(shape.text)
        return "\n".join(lines)

    if suffix == ".ipynb":
        import nbformat

        nb = nbformat.read(str(path), as_version=4)
        lines = []
        for cell in nb.cells:
            # Keep markdown and code cells; skip blank ones.
            if cell.cell_type in ("markdown", "code") and cell.source.strip():
                lines.append(cell.source)
        return "\n\n".join(lines)

    # Plain-text fallback (.txt, .md, and anything else). Bytes that are not
    # valid UTF-8 become U+FFFD instead of raising UnicodeDecodeError, so a
    # single mis-encoded file cannot abort a whole folder scan.
    return path.read_text(encoding="utf-8", errors="replace")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _chunk(text: str, chunk_size: int = None, overlap: int = None) -> list[str]:
    """Split text into overlapping windows of whitespace-separated words.

    Consecutive chunks share ``overlap`` words. Chunking stops as soon as a
    window reaches the last word, so no trailing chunk is emitted that only
    repeats words already contained in the previous one.

    Args:
        text: The text to split. Words are found with ``str.split()``.
        chunk_size: Maximum words per chunk. Defaults to ``CHUNK_SIZE``.
        overlap: Words shared by consecutive chunks. Defaults to
            ``CHUNK_OVERLAP``.

    Returns:
        The chunks in document order, each a single-space-joined string.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size`` (the window would
            never advance).
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth.
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    shared = CHUNK_OVERLAP if overlap is None else overlap
    if size <= 0 or not 0 <= shared < size:
        raise ValueError(
            "chunk_size must be positive and overlap must satisfy "
            "0 <= overlap < chunk_size."
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), size - shared):
        chunks.append(" ".join(words[start:start + size]))
        # This window already covers the final word; a further window would
        # contain nothing but the overlap tail.
        if start + size >= len(words):
            break
    return chunks
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def ask(question: str, source: str = None, text: str = None) -> str:
    """Answer a question from a folder of analysis documents and/or raw text.

    The inputs are split into word chunks and embedded into a temporary
    ChromaDB collection. The chunks most similar to the question (at most 8)
    are sent to Claude as context.

    Args:
        question: The question to answer.
        source: Path to a folder containing .txt, .md, .pdf, .docx, .pptx,
            or .ipynb files. Only the folder's direct entries are read.
        text: A raw text string to search. May be combined with ``source``.

    Returns:
        The text of the first content block of Claude's reply.

    Raises:
        ValueError: If neither ``source`` nor ``text`` is given, or if no
            text could be extracted from them.
        EnvironmentError: If ``ANTHROPIC_API_KEY`` is not set.
    """
    import uuid  # only needed to name the per-call collection

    if not source and not text:
        raise ValueError("Provide either 'source' (folder path) or 'text'.")

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        raise EnvironmentError(
            "ANTHROPIC_API_KEY not found. Set it with: export ANTHROPIC_API_KEY=your_key_here"
        )

    # Collect chunks
    chunks = []

    if text:
        chunks.extend(_chunk(text))

    if source:
        folder = Path(source).expanduser().resolve()
        supported = {".txt", ".md", ".pdf", ".docx", ".pptx", ".ipynb"}
        # Sorted so chunk order does not depend on directory listing order.
        for file in sorted(folder.iterdir()):
            # is_file() skips sub-directories whose name ends in a supported
            # suffix; the loaders cannot read those.
            if file.is_file() and file.suffix.lower() in supported:
                chunks.extend(_chunk(_load_file(file)))

    if not chunks:
        raise ValueError("No content found. Check your source folder or text input.")

    # Embed and store in ChromaDB.
    # Each call gets its own uniquely named collection. A fixed name would be
    # reused by later calls in the same process, and add() does not overwrite
    # ids that already exist, so an earlier call's chunks could be returned
    # for a new question.
    embedding_fn = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    client = chromadb.Client()
    collection_name = f"bridgekit_ask_{uuid.uuid4().hex}"
    collection = client.create_collection(
        name=collection_name,
        embedding_function=embedding_fn,
    )
    try:
        collection.add(
            documents=chunks,
            ids=[f"chunk_{i}" for i in range(len(chunks))],
        )

        # Retrieve most relevant chunks; never ask for more than exist.
        results = collection.query(
            query_texts=[question],
            n_results=min(8, len(chunks)),
        )
        context = "\n\n".join(results["documents"][0])
    finally:
        # Drop the temporary collection so repeated calls do not accumulate
        # embeddings in memory.
        client.delete_collection(name=collection_name)

    # Generate answer with Claude
    anthropic_client = anthropic.Anthropic(api_key=api_key)
    message = anthropic_client.messages.create(
        model="claude-opus-4-5",
        max_tokens=1024,
        system=(
            "You are a senior data scientist answering questions based on analysis reports. "
            "Answer only from the provided context. Be specific and cite findings where relevant. "
            "If the context does not contain enough information to answer, say so clearly."
        ),
        messages=[{
            "role": "user",
            "content": f"Context from analysis reports:\n\n{context}\n\nQuestion: {question}"
        }]
    )

    return message.content[0].text
|
|
@@ -1,3 +1,28 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bridgekit
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: AI tools that make you a better data scientist, not a redundant one.
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/getbridgekit/bridgekit
|
|
7
|
+
Project-URL: Issues, https://github.com/getbridgekit/bridgekit/issues
|
|
8
|
+
Keywords: data science,AI,analysis,evaluation,anthropic
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: anthropic>=0.20.0
|
|
18
|
+
Requires-Dist: chromadb>=0.4.0
|
|
19
|
+
Requires-Dist: sentence-transformers>=2.0.0
|
|
20
|
+
Requires-Dist: pypdf>=3.0.0
|
|
21
|
+
Requires-Dist: python-docx>=1.0.0
|
|
22
|
+
Requires-Dist: python-pptx>=0.6.0
|
|
23
|
+
Requires-Dist: nbformat>=5.0.0
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
1
26
|
# Bridgekit
|
|
2
27
|
|
|
3
28
|
**AI tools that make you a better data scientist, not a redundant one.**
|
|
@@ -115,6 +140,47 @@ Paste your writeup as a string and call `evaluate()` — that's it.
|
|
|
115
140
|
|
|
116
141
|
---
|
|
117
142
|
|
|
143
|
+
## Tool #2: Analysis Search
|
|
144
|
+
|
|
145
|
+
Ask questions across a collection of your past analysis documents. Point it at a folder and get answers grounded in your actual work — no digging through files manually.
|
|
146
|
+
|
|
147
|
+
Uses a vector database and semantic similarity to find relevant context across your documents — not keyword matching.
|
|
148
|
+
|
|
149
|
+
Supports `.txt`, `.md`, `.pdf`, `.docx`, `.pptx`, and `.ipynb` files.
|
|
150
|
+
|
|
151
|
+
**From a folder:**
|
|
152
|
+
```python
|
|
153
|
+
from bridgekit import ask
|
|
154
|
+
|
|
155
|
+
print(ask("what drove churn in Q3?", source="reports/"))
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
**From raw text:**
|
|
159
|
+
```python
|
|
160
|
+
from bridgekit import ask
|
|
161
|
+
|
|
162
|
+
text = """
|
|
163
|
+
Q3 churn rose to 4.5%, driven by a product outage in August and a pricing
|
|
164
|
+
change in July that increased SMB costs by 12%.
|
|
165
|
+
"""
|
|
166
|
+
|
|
167
|
+
print(ask("what caused the Q3 churn spike?", text=text))
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
**Output** *(based on sample data included in the repo)*:
|
|
171
|
+
```
|
|
172
|
+
Based on the Q3 2024 Churn Analysis, two primary factors drove the elevated
|
|
173
|
+
churn rate of 4.5%:
|
|
174
|
+
|
|
175
|
+
1. August Product Outage — A 14-hour outage affected 3,800 accounts. Impacted
|
|
176
|
+
accounts churned at 8.1% vs 3.2% for unaffected accounts.
|
|
177
|
+
|
|
178
|
+
2. July Pricing Change — SMB costs increased by an average of 12%, causing SMB
|
|
179
|
+
churn to spike to 7.2% — the highest single-month figure in the dataset.
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
118
184
|
## Why not just use Claude?
|
|
119
185
|
|
|
120
186
|
You could. But you'd need to know what to ask, how to frame it, and what a good answer looks like. Bridgekit has that baked in — it knows you're a data scientist presenting findings, so it asks the right questions automatically. No prompt engineering required. Just paste your work and run it.
|
|
@@ -137,7 +203,7 @@ Bridgekit only ever sees text you write yourself — your narrative, your conclu
|
|
|
137
203
|
|
|
138
204
|
## What's next?
|
|
139
205
|
|
|
140
|
-
Bridgekit is a suite, not a one-off.
|
|
206
|
+
Bridgekit is a suite, not a one-off. Two tools are live — more are coming:
|
|
141
207
|
|
|
142
208
|
- **Statistical approach suggester** — describe your problem in plain English, get the right test and why
|
|
143
209
|
- **Stakeholder translator** — turn your technical findings into a narrative a non-technical audience will actually follow
|
|
@@ -2,9 +2,12 @@
|
|
|
2
2
|
requires = ["setuptools>=61.0"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
|
+
[tool.setuptools.packages.find]
|
|
6
|
+
include = ["bridgekit*"]
|
|
7
|
+
|
|
5
8
|
[project]
|
|
6
9
|
name = "bridgekit"
|
|
7
|
-
version = "0.1.1"
|
|
10
|
+
version = "0.2.0"
|
|
8
11
|
description = "AI tools that make you a better data scientist, not a redundant one."
|
|
9
12
|
readme = "README.md"
|
|
10
13
|
requires-python = ">=3.9"
|
|
@@ -19,6 +22,12 @@ classifiers = [
|
|
|
19
22
|
]
|
|
20
23
|
dependencies = [
|
|
21
24
|
"anthropic>=0.20.0",
|
|
25
|
+
"chromadb>=0.4.0",
|
|
26
|
+
"sentence-transformers>=2.0.0",
|
|
27
|
+
"pypdf>=3.0.0",
|
|
28
|
+
"python-docx>=1.0.0",
|
|
29
|
+
"python-pptx>=0.6.0",
|
|
30
|
+
"nbformat>=5.0.0",
|
|
22
31
|
]
|
|
23
32
|
|
|
24
33
|
[project.urls]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
anthropic>=0.20.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|