bridgekit 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bridgekit
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: AI tools that make you a better data scientist, not a redundant one.
5
5
  License: MIT
6
- Project-URL: Homepage, https://github.com/bridgekit/bridgekit
7
- Project-URL: Issues, https://github.com/bridgekit/bridgekit/issues
6
+ Project-URL: Homepage, https://github.com/getbridgekit/bridgekit
7
+ Project-URL: Issues, https://github.com/getbridgekit/bridgekit/issues
8
8
  Keywords: data science,AI,analysis,evaluation,anthropic
9
9
  Classifier: Development Status :: 3 - Alpha
10
10
  Classifier: Intended Audience :: Science/Research
@@ -15,6 +15,12 @@ Requires-Python: >=3.9
15
15
  Description-Content-Type: text/markdown
16
16
  License-File: LICENSE
17
17
  Requires-Dist: anthropic>=0.20.0
18
+ Requires-Dist: chromadb>=0.4.0
19
+ Requires-Dist: sentence-transformers>=2.0.0
20
+ Requires-Dist: pypdf>=3.0.0
21
+ Requires-Dist: python-docx>=1.0.0
22
+ Requires-Dist: python-pptx>=0.6.0
23
+ Requires-Dist: nbformat>=5.0.0
18
24
  Dynamic: license-file
19
25
 
20
26
  # Bridgekit
@@ -76,10 +82,23 @@ the finding.
76
82
 
77
83
  ## Installation
78
84
 
85
+ **Standard install:**
79
86
  ```bash
80
87
  pip install bridgekit
81
88
  ```
82
89
 
90
+ **In a virtual environment (recommended for clean setups):**
91
+ ```bash
92
+ python -m venv .venv
93
+ source .venv/bin/activate
94
+ pip install bridgekit
95
+ ```
96
+
97
+ **In a Jupyter notebook:**
98
+ ```python
99
+ !pip install bridgekit
100
+ ```
101
+
83
102
  Requires an Anthropic API key:
84
103
 
85
104
  ```bash
@@ -121,6 +140,47 @@ Paste your writeup as a string and call `evaluate()` — that's it.
121
140
 
122
141
  ---
123
142
 
143
+ ## Tool #2: Analysis Search
144
+
145
+ Ask questions across a collection of your past analysis documents. Point it at a folder and get answers grounded in your actual work — no digging through files manually.
146
+
147
+ Uses a vector database and semantic similarity to find relevant context across your documents — not keyword matching.
148
+
149
+ Supports `.txt`, `.md`, `.pdf`, `.docx`, `.pptx`, and `.ipynb` files.
150
+
151
+ **From a folder:**
152
+ ```python
153
+ from bridgekit import ask
154
+
155
+ print(ask("what drove churn in Q3?", source="reports/"))
156
+ ```
157
+
158
+ **From raw text:**
159
+ ```python
160
+ from bridgekit import ask
161
+
162
+ text = """
163
+ Q3 churn rose to 4.5%, driven by a product outage in August and a pricing
164
+ change in July that increased SMB costs by 12%.
165
+ """
166
+
167
+ print(ask("what caused the Q3 churn spike?", text=text))
168
+ ```
169
+
170
+ **Output** *(based on sample data included in the repo)*:
171
+ ```
172
+ Based on the Q3 2024 Churn Analysis, two primary factors drove the elevated
173
+ churn rate of 4.5%:
174
+
175
+ 1. August Product Outage — A 14-hour outage affected 3,800 accounts. Impacted
176
+ accounts churned at 8.1% vs 3.2% for unaffected accounts.
177
+
178
+ 2. July Pricing Change — SMB costs increased by an average of 12%, causing SMB
179
+ churn to spike to 7.2% — the highest single-month figure in the dataset.
180
+ ```
181
+
182
+ ---
183
+
124
184
  ## Why not just use Claude?
125
185
 
126
186
  You could. But you'd need to know what to ask, how to frame it, and what a good answer looks like. Bridgekit has that baked in — it knows you're a data scientist presenting findings, so it asks the right questions automatically. No prompt engineering required. Just paste your work and run it.
@@ -143,7 +203,7 @@ Bridgekit only ever sees text you write yourself — your narrative, your conclu
143
203
 
144
204
  ## What's next?
145
205
 
146
- Bridgekit is a suite, not a one-off. The analysis reviewer is tool #1. Coming next:
206
+ Bridgekit is a suite, not a one-off. Two tools are live — more are coming:
147
207
 
148
208
  - **Statistical approach suggester** — describe your problem in plain English, get the right test and why
149
209
  - **Stakeholder translator** — turn your technical findings into a narrative a non-technical audience will actually follow
@@ -1,22 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: bridgekit
3
- Version: 0.1.0
4
- Summary: AI tools that make you a better data scientist, not a redundant one.
5
- License: MIT
6
- Project-URL: Homepage, https://github.com/bridgekit/bridgekit
7
- Project-URL: Issues, https://github.com/bridgekit/bridgekit/issues
8
- Keywords: data science,AI,analysis,evaluation,anthropic
9
- Classifier: Development Status :: 3 - Alpha
10
- Classifier: Intended Audience :: Science/Research
11
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
- Classifier: Programming Language :: Python :: 3
13
- Classifier: License :: OSI Approved :: MIT License
14
- Requires-Python: >=3.9
15
- Description-Content-Type: text/markdown
16
- License-File: LICENSE
17
- Requires-Dist: anthropic>=0.20.0
18
- Dynamic: license-file
19
-
20
1
  # Bridgekit
21
2
 
22
3
  **AI tools that make you a better data scientist, not a redundant one.**
@@ -76,10 +57,23 @@ the finding.
76
57
 
77
58
  ## Installation
78
59
 
60
+ **Standard install:**
61
+ ```bash
62
+ pip install bridgekit
63
+ ```
64
+
65
+ **In a virtual environment (recommended for clean setups):**
79
66
  ```bash
67
+ python -m venv .venv
68
+ source .venv/bin/activate
80
69
  pip install bridgekit
81
70
  ```
82
71
 
72
+ **In a Jupyter notebook:**
73
+ ```python
74
+ !pip install bridgekit
75
+ ```
76
+
83
77
  Requires an Anthropic API key:
84
78
 
85
79
  ```bash
@@ -121,6 +115,47 @@ Paste your writeup as a string and call `evaluate()` — that's it.
121
115
 
122
116
  ---
123
117
 
118
+ ## Tool #2: Analysis Search
119
+
120
+ Ask questions across a collection of your past analysis documents. Point it at a folder and get answers grounded in your actual work — no digging through files manually.
121
+
122
+ Uses a vector database and semantic similarity to find relevant context across your documents — not keyword matching.
123
+
124
+ Supports `.txt`, `.md`, `.pdf`, `.docx`, `.pptx`, and `.ipynb` files.
125
+
126
+ **From a folder:**
127
+ ```python
128
+ from bridgekit import ask
129
+
130
+ print(ask("what drove churn in Q3?", source="reports/"))
131
+ ```
132
+
133
+ **From raw text:**
134
+ ```python
135
+ from bridgekit import ask
136
+
137
+ text = """
138
+ Q3 churn rose to 4.5%, driven by a product outage in August and a pricing
139
+ change in July that increased SMB costs by 12%.
140
+ """
141
+
142
+ print(ask("what caused the Q3 churn spike?", text=text))
143
+ ```
144
+
145
+ **Output** *(based on sample data included in the repo)*:
146
+ ```
147
+ Based on the Q3 2024 Churn Analysis, two primary factors drove the elevated
148
+ churn rate of 4.5%:
149
+
150
+ 1. August Product Outage — A 14-hour outage affected 3,800 accounts. Impacted
151
+ accounts churned at 8.1% vs 3.2% for unaffected accounts.
152
+
153
+ 2. July Pricing Change — SMB costs increased by an average of 12%, causing SMB
154
+ churn to spike to 7.2% — the highest single-month figure in the dataset.
155
+ ```
156
+
157
+ ---
158
+
124
159
  ## Why not just use Claude?
125
160
 
126
161
  You could. But you'd need to know what to ask, how to frame it, and what a good answer looks like. Bridgekit has that baked in — it knows you're a data scientist presenting findings, so it asks the right questions automatically. No prompt engineering required. Just paste your work and run it.
@@ -143,7 +178,7 @@ Bridgekit only ever sees text you write yourself — your narrative, your conclu
143
178
 
144
179
  ## What's next?
145
180
 
146
- Bridgekit is a suite, not a one-off. The analysis reviewer is tool #1. Coming next:
181
+ Bridgekit is a suite, not a one-off. Two tools are live — more are coming:
147
182
 
148
183
  - **Statistical approach suggester** — describe your problem in plain English, get the right test and why
149
184
  - **Stakeholder translator** — turn your technical findings into a narrative a non-technical audience will actually follow
@@ -0,0 +1,5 @@
1
+ from .reviewer import evaluate
2
+ from .search import ask
3
+
4
+ __version__ = "0.2.0"
5
+ __all__ = ["evaluate", "ask"]
@@ -0,0 +1,122 @@
1
+ import os
2
+ from pathlib import Path
3
+ import anthropic
4
+ import chromadb
5
+ from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
6
+
7
+ CHUNK_SIZE = 150 # words per chunk
8
+ CHUNK_OVERLAP = 20
9
+
10
+
11
+ def _load_file(path: Path) -> str:
12
+ suffix = path.suffix.lower()
13
+ if suffix == ".pdf":
14
+ import pypdf
15
+ reader = pypdf.PdfReader(str(path))
16
+ return "\n".join(page.extract_text() or "" for page in reader.pages)
17
+ elif suffix == ".docx":
18
+ import docx
19
+ doc = docx.Document(str(path))
20
+ return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
21
+ elif suffix == ".pptx":
22
+ from pptx import Presentation
23
+ prs = Presentation(str(path))
24
+ lines = []
25
+ for slide in prs.slides:
26
+ for shape in slide.shapes:
27
+ if hasattr(shape, "text") and shape.text.strip():
28
+ lines.append(shape.text)
29
+ return "\n".join(lines)
30
+ elif suffix == ".ipynb":
31
+ import nbformat
32
+ nb = nbformat.read(str(path), as_version=4)
33
+ lines = []
34
+ for cell in nb.cells:
35
+ if cell.cell_type in ("markdown", "code") and cell.source.strip():
36
+ lines.append(cell.source)
37
+ return "\n\n".join(lines)
38
+ else:
39
+ return path.read_text(encoding="utf-8")
40
+
41
+
42
+ def _chunk(text: str) -> list[str]:
43
+ words = text.split()
44
+ chunks = []
45
+ i = 0
46
+ while i < len(words):
47
+ chunks.append(" ".join(words[i:i + CHUNK_SIZE]))
48
+ i += CHUNK_SIZE - CHUNK_OVERLAP
49
+ return [c for c in chunks if c.strip()]
50
+
51
+
52
+ def ask(question: str, source: str = None, text: str = None) -> str:
53
+ """
54
+ Ask a question across a collection of analysis documents or raw text.
55
+
56
+ Args:
57
+ question: The question to answer.
58
+ source: Path to a folder containing .txt, .md, .pdf, .docx, .pptx, or .ipynb files.
59
+ text: A raw text string to search instead of a folder.
60
+
61
+ Returns:
62
+ An answer grounded in the provided documents.
63
+ """
64
+ if not source and not text:
65
+ raise ValueError("Provide either 'source' (folder path) or 'text'.")
66
+
67
+ api_key = os.environ.get("ANTHROPIC_API_KEY")
68
+ if not api_key:
69
+ raise EnvironmentError(
70
+ "ANTHROPIC_API_KEY not found. Set it with: export ANTHROPIC_API_KEY=your_key_here"
71
+ )
72
+
73
+ # Collect chunks
74
+ chunks = []
75
+
76
+ if text:
77
+ chunks.extend(_chunk(text))
78
+
79
+ if source:
80
+ folder = Path(source).expanduser().resolve()
81
+ supported = {".txt", ".md", ".pdf", ".docx", ".pptx", ".ipynb"}
82
+ for file in sorted(folder.iterdir()):
83
+ if file.suffix.lower() in supported:
84
+ content = _load_file(file)
85
+ chunks.extend(_chunk(content))
86
+
87
+ if not chunks:
88
+ raise ValueError("No content found. Check your source folder or text input.")
89
+
90
+ # Embed and store in ChromaDB
91
+ embedding_fn = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
92
+ client = chromadb.Client()
93
+ collection = client.get_or_create_collection(
94
+ name="bridgekit_ask",
95
+ embedding_function=embedding_fn
96
+ )
97
+ collection.add(
98
+ documents=chunks,
99
+ ids=[f"chunk_{i}" for i in range(len(chunks))]
100
+ )
101
+
102
+ # Retrieve most relevant chunks
103
+ results = collection.query(query_texts=[question], n_results=min(8, len(chunks)))
104
+ context = "\n\n".join(results["documents"][0])
105
+
106
+ # Generate answer with Claude
107
+ anthropic_client = anthropic.Anthropic(api_key=api_key)
108
+ message = anthropic_client.messages.create(
109
+ model="claude-opus-4-5",
110
+ max_tokens=1024,
111
+ system=(
112
+ "You are a senior data scientist answering questions based on analysis reports. "
113
+ "Answer only from the provided context. Be specific and cite findings where relevant. "
114
+ "If the context does not contain enough information to answer, say so clearly."
115
+ ),
116
+ messages=[{
117
+ "role": "user",
118
+ "content": f"Context from analysis reports:\n\n{context}\n\nQuestion: {question}"
119
+ }]
120
+ )
121
+
122
+ return message.content[0].text
@@ -1,3 +1,28 @@
1
+ Metadata-Version: 2.4
2
+ Name: bridgekit
3
+ Version: 0.2.0
4
+ Summary: AI tools that make you a better data scientist, not a redundant one.
5
+ License: MIT
6
+ Project-URL: Homepage, https://github.com/getbridgekit/bridgekit
7
+ Project-URL: Issues, https://github.com/getbridgekit/bridgekit/issues
8
+ Keywords: data science,AI,analysis,evaluation,anthropic
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Requires-Dist: anthropic>=0.20.0
18
+ Requires-Dist: chromadb>=0.4.0
19
+ Requires-Dist: sentence-transformers>=2.0.0
20
+ Requires-Dist: pypdf>=3.0.0
21
+ Requires-Dist: python-docx>=1.0.0
22
+ Requires-Dist: python-pptx>=0.6.0
23
+ Requires-Dist: nbformat>=5.0.0
24
+ Dynamic: license-file
25
+
1
26
  # Bridgekit
2
27
 
3
28
  **AI tools that make you a better data scientist, not a redundant one.**
@@ -57,10 +82,23 @@ the finding.
57
82
 
58
83
  ## Installation
59
84
 
85
+ **Standard install:**
86
+ ```bash
87
+ pip install bridgekit
88
+ ```
89
+
90
+ **In a virtual environment (recommended for clean setups):**
60
91
  ```bash
92
+ python -m venv .venv
93
+ source .venv/bin/activate
61
94
  pip install bridgekit
62
95
  ```
63
96
 
97
+ **In a Jupyter notebook:**
98
+ ```python
99
+ !pip install bridgekit
100
+ ```
101
+
64
102
  Requires an Anthropic API key:
65
103
 
66
104
  ```bash
@@ -102,6 +140,47 @@ Paste your writeup as a string and call `evaluate()` — that's it.
102
140
 
103
141
  ---
104
142
 
143
+ ## Tool #2: Analysis Search
144
+
145
+ Ask questions across a collection of your past analysis documents. Point it at a folder and get answers grounded in your actual work — no digging through files manually.
146
+
147
+ Uses a vector database and semantic similarity to find relevant context across your documents — not keyword matching.
148
+
149
+ Supports `.txt`, `.md`, `.pdf`, `.docx`, `.pptx`, and `.ipynb` files.
150
+
151
+ **From a folder:**
152
+ ```python
153
+ from bridgekit import ask
154
+
155
+ print(ask("what drove churn in Q3?", source="reports/"))
156
+ ```
157
+
158
+ **From raw text:**
159
+ ```python
160
+ from bridgekit import ask
161
+
162
+ text = """
163
+ Q3 churn rose to 4.5%, driven by a product outage in August and a pricing
164
+ change in July that increased SMB costs by 12%.
165
+ """
166
+
167
+ print(ask("what caused the Q3 churn spike?", text=text))
168
+ ```
169
+
170
+ **Output** *(based on sample data included in the repo)*:
171
+ ```
172
+ Based on the Q3 2024 Churn Analysis, two primary factors drove the elevated
173
+ churn rate of 4.5%:
174
+
175
+ 1. August Product Outage — A 14-hour outage affected 3,800 accounts. Impacted
176
+ accounts churned at 8.1% vs 3.2% for unaffected accounts.
177
+
178
+ 2. July Pricing Change — SMB costs increased by an average of 12%, causing SMB
179
+ churn to spike to 7.2% — the highest single-month figure in the dataset.
180
+ ```
181
+
182
+ ---
183
+
105
184
  ## Why not just use Claude?
106
185
 
107
186
  You could. But you'd need to know what to ask, how to frame it, and what a good answer looks like. Bridgekit has that baked in — it knows you're a data scientist presenting findings, so it asks the right questions automatically. No prompt engineering required. Just paste your work and run it.
@@ -124,7 +203,7 @@ Bridgekit only ever sees text you write yourself — your narrative, your conclu
124
203
 
125
204
  ## What's next?
126
205
 
127
- Bridgekit is a suite, not a one-off. The analysis reviewer is tool #1. Coming next:
206
+ Bridgekit is a suite, not a one-off. Two tools are live — more are coming:
128
207
 
129
208
  - **Statistical approach suggester** — describe your problem in plain English, get the right test and why
130
209
  - **Stakeholder translator** — turn your technical findings into a narrative a non-technical audience will actually follow
@@ -3,6 +3,7 @@ README.md
3
3
  pyproject.toml
4
4
  bridgekit/__init__.py
5
5
  bridgekit/reviewer.py
6
+ bridgekit/search.py
6
7
  bridgekit.egg-info/PKG-INFO
7
8
  bridgekit.egg-info/SOURCES.txt
8
9
  bridgekit.egg-info/dependency_links.txt
@@ -0,0 +1,7 @@
1
+ anthropic>=0.20.0
2
+ chromadb>=0.4.0
3
+ sentence-transformers>=2.0.0
4
+ pypdf>=3.0.0
5
+ python-docx>=1.0.0
6
+ python-pptx>=0.6.0
7
+ nbformat>=5.0.0
@@ -2,9 +2,12 @@
2
2
  requires = ["setuptools>=61.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
 
5
+ [tool.setuptools.packages.find]
6
+ include = ["bridgekit*"]
7
+
5
8
  [project]
6
9
  name = "bridgekit"
7
- version = "0.1.0"
10
+ version = "0.2.0"
8
11
  description = "AI tools that make you a better data scientist, not a redundant one."
9
12
  readme = "README.md"
10
13
  requires-python = ">=3.9"
@@ -19,8 +22,14 @@ classifiers = [
19
22
  ]
20
23
  dependencies = [
21
24
  "anthropic>=0.20.0",
25
+ "chromadb>=0.4.0",
26
+ "sentence-transformers>=2.0.0",
27
+ "pypdf>=3.0.0",
28
+ "python-docx>=1.0.0",
29
+ "python-pptx>=0.6.0",
30
+ "nbformat>=5.0.0",
22
31
  ]
23
32
 
24
33
  [project.urls]
25
- Homepage = "https://github.com/bridgekit/bridgekit"
26
- Issues = "https://github.com/bridgekit/bridgekit/issues"
34
+ Homepage = "https://github.com/getbridgekit/bridgekit"
35
+ Issues = "https://github.com/getbridgekit/bridgekit/issues"
@@ -1,4 +0,0 @@
1
- from .reviewer import evaluate
2
-
3
- __version__ = "0.1.0"
4
- __all__ = ["evaluate"]
@@ -1 +0,0 @@
1
- anthropic>=0.20.0
File without changes
File without changes