botstash 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- botstash-0.1.0/.gitignore +39 -0
- botstash-0.1.0/LICENSE +21 -0
- botstash-0.1.0/PKG-INFO +86 -0
- botstash-0.1.0/README.md +54 -0
- botstash-0.1.0/VECTORBOARD_SPEC.md +257 -0
- botstash-0.1.0/pyproject.toml +74 -0
- botstash-0.1.0/src/botstash/__init__.py +3 -0
- botstash-0.1.0/src/botstash/anythingllm/__init__.py +0 -0
- botstash-0.1.0/src/botstash/anythingllm/client.py +1 -0
- botstash-0.1.0/src/botstash/classifier/__init__.py +0 -0
- botstash-0.1.0/src/botstash/classifier/auto.py +1 -0
- botstash-0.1.0/src/botstash/cli.py +70 -0
- botstash-0.1.0/src/botstash/config.py +1 -0
- botstash-0.1.0/src/botstash/extractors/__init__.py +0 -0
- botstash-0.1.0/src/botstash/extractors/docx.py +1 -0
- botstash-0.1.0/src/botstash/extractors/pdf.py +1 -0
- botstash-0.1.0/src/botstash/extractors/pptx.py +1 -0
- botstash-0.1.0/src/botstash/extractors/qti.py +1 -0
- botstash-0.1.0/src/botstash/extractors/unit_outline.py +1 -0
- botstash-0.1.0/src/botstash/extractors/url_tracker.py +1 -0
- botstash-0.1.0/src/botstash/extractors/vtt.py +1 -0
- botstash-0.1.0/src/botstash/ingester/__init__.py +0 -0
- botstash-0.1.0/src/botstash/ingester/imscc.py +1 -0
- botstash-0.1.0/src/botstash/ingester/transcript.py +1 -0
- botstash-0.1.0/src/botstash/pipeline.py +1 -0
- botstash-0.1.0/src/botstash/py.typed +0 -0
- botstash-0.1.0/src/botstash/webui/__init__.py +0 -0
- botstash-0.1.0/src/botstash/webui/app.py +1 -0
- botstash-0.1.0/tests/__init__.py +0 -0
- botstash-0.1.0/tests/test_version.py +7 -0
- botstash-0.1.0/uv.lock +1412 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.whl
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# IDE
|
|
17
|
+
.idea/
|
|
18
|
+
.vscode/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
*~
|
|
22
|
+
|
|
23
|
+
# Testing
|
|
24
|
+
.pytest_cache/
|
|
25
|
+
.coverage
|
|
26
|
+
htmlcov/
|
|
27
|
+
.mypy_cache/
|
|
28
|
+
.ruff_cache/
|
|
29
|
+
|
|
30
|
+
# Environment / secrets
|
|
31
|
+
.env
|
|
32
|
+
.vectorboard.env
|
|
33
|
+
|
|
34
|
+
# OS
|
|
35
|
+
.DS_Store
|
|
36
|
+
Thumbs.db
|
|
37
|
+
|
|
38
|
+
# Staging (runtime output)
|
|
39
|
+
staging/
|
botstash-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Michael
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
botstash-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: botstash
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI tool and WebUI that ingests LMS course exports and Echo360 transcripts into an AnythingLLM workspace for embedded course chatbots.
|
|
5
|
+
Project-URL: Homepage, https://github.com/michael-borck/botstash
|
|
6
|
+
Project-URL: Repository, https://github.com/michael-borck/botstash
|
|
7
|
+
Project-URL: Issues, https://github.com/michael-borck/botstash/issues
|
|
8
|
+
Author-email: Michael <michael@example.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: anythingllm,botstash,chatbot,education,lms,rag
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Education
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Education
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Requires-Dist: click>=8.1
|
|
22
|
+
Requires-Dist: fastapi>=0.115
|
|
23
|
+
Requires-Dist: httpx>=0.27
|
|
24
|
+
Requires-Dist: jinja2>=3.1
|
|
25
|
+
Requires-Dist: pdfminer-six>=20231228
|
|
26
|
+
Requires-Dist: python-docx>=1.1
|
|
27
|
+
Requires-Dist: python-dotenv>=1.0
|
|
28
|
+
Requires-Dist: python-pptx>=1.0
|
|
29
|
+
Requires-Dist: uvicorn>=0.32
|
|
30
|
+
Requires-Dist: webvtt-py>=0.5
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# BotStash
|
|
34
|
+
|
|
35
|
+
A CLI tool and lightweight WebUI that ingests LMS course exports (Blackboard IMSCC / Canvas) and Echo360 VTT transcripts, uploads content to an [AnythingLLM](https://anythingllm.com/) workspace, and returns embeddable chatbot code for pasting into a course page.
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
|
|
39
|
+
- **IMSCC ingestion** — unzips and walks Blackboard/Canvas common cartridge exports
|
|
40
|
+
- **Transcript ingestion** — processes folders of Echo360 VTT files
|
|
41
|
+
- **Multi-format extraction** — PPTX, DOCX, PDF, VTT, QTI quizzes
|
|
42
|
+
- **Auto-classification** — heuristic tagging of content types (lecture, worksheet, assignment, etc.)
|
|
43
|
+
- **AnythingLLM integration** — uploads documents, manages workspaces, retrieves embed code
|
|
44
|
+
- **WebUI** — FastAPI + Jinja2 interface for non-terminal users
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install botstash
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Full pipeline
|
|
56
|
+
botstash run course.zip transcripts/ \
|
|
57
|
+
--workspace ISYS2001 \
|
|
58
|
+
--url https://your-anythingllm.instance \
|
|
59
|
+
--key YOUR_API_KEY
|
|
60
|
+
|
|
61
|
+
# Two-step workflow (extract, review, embed)
|
|
62
|
+
botstash extract course.zip transcripts/ --output ./staging/
|
|
63
|
+
# ... review staging/tags.json ...
|
|
64
|
+
botstash embed ./staging/ --workspace ISYS2001
|
|
65
|
+
|
|
66
|
+
# Launch WebUI
|
|
67
|
+
botstash serve
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Development
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# Clone and install in dev mode
|
|
74
|
+
git clone https://github.com/michael-borck/botstash.git
|
|
75
|
+
cd botstash
|
|
76
|
+
uv sync --dev
|
|
77
|
+
|
|
78
|
+
# Run checks
|
|
79
|
+
uv run ruff check src/ tests/
|
|
80
|
+
uv run mypy src/
|
|
81
|
+
uv run pytest
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## License
|
|
85
|
+
|
|
86
|
+
MIT
|
botstash-0.1.0/README.md
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# BotStash
|
|
2
|
+
|
|
3
|
+
A CLI tool and lightweight WebUI that ingests LMS course exports (Blackboard IMSCC / Canvas) and Echo360 VTT transcripts, uploads content to an [AnythingLLM](https://anythingllm.com/) workspace, and returns embeddable chatbot code for pasting into a course page.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **IMSCC ingestion** — unzips and walks Blackboard/Canvas common cartridge exports
|
|
8
|
+
- **Transcript ingestion** — processes folders of Echo360 VTT files
|
|
9
|
+
- **Multi-format extraction** — PPTX, DOCX, PDF, VTT, QTI quizzes
|
|
10
|
+
- **Auto-classification** — heuristic tagging of content types (lecture, worksheet, assignment, etc.)
|
|
11
|
+
- **AnythingLLM integration** — uploads documents, manages workspaces, retrieves embed code
|
|
12
|
+
- **WebUI** — FastAPI + Jinja2 interface for non-terminal users
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install botstash
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Full pipeline
|
|
24
|
+
botstash run course.zip transcripts/ \
|
|
25
|
+
--workspace ISYS2001 \
|
|
26
|
+
--url https://your-anythingllm.instance \
|
|
27
|
+
--key YOUR_API_KEY
|
|
28
|
+
|
|
29
|
+
# Two-step workflow (extract, review, embed)
|
|
30
|
+
botstash extract course.zip transcripts/ --output ./staging/
|
|
31
|
+
# ... review staging/tags.json ...
|
|
32
|
+
botstash embed ./staging/ --workspace ISYS2001
|
|
33
|
+
|
|
34
|
+
# Launch WebUI
|
|
35
|
+
botstash serve
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Development
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# Clone and install in dev mode
|
|
42
|
+
git clone https://github.com/michael-borck/botstash.git
|
|
43
|
+
cd botstash
|
|
44
|
+
uv sync --dev
|
|
45
|
+
|
|
46
|
+
# Run checks
|
|
47
|
+
uv run ruff check src/ tests/
|
|
48
|
+
uv run mypy src/
|
|
49
|
+
uv run pytest
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## License
|
|
53
|
+
|
|
54
|
+
MIT
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
# VectorBoard — Project Specification v0.1
|
|
2
|
+
|
|
3
|
+
> A CLI tool and lightweight WebUI that ingests an LMS course export (Blackboard IMSCC or Canvas) and a folder of Echo360 VTT transcripts, extracts and classifies content, uploads it to an AnythingLLM workspace, and returns embeddable chatbot code for pasting into a course page.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Name Rationale
|
|
8
|
+
|
|
9
|
+
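**VectorBoard** — vector embeddings + "board" as a generic term for a course/learning board. Intentionally LMS-agnostic so it survives the move from Blackboard to Canvas without renaming. Python package name: `vectorboard`. (Note: this spec predates a rename — the package is published as `botstash`, and the `vectorboard` commands shown below are invoked as `botstash`.)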
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Goals
|
|
14
|
+
|
|
15
|
+
- Give students a citeable, course-aware AI chatbot embedded directly in their LMS page
|
|
16
|
+
- Require minimal technical effort from the lecturer (one command or a simple WebUI)
|
|
17
|
+
- Support both automated and manually-reviewed classification workflows
|
|
18
|
+
- Be LMS-portable: IMSCC is the common cartridge format shared by Blackboard and Canvas
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Architecture
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
vectorboard/
|
|
26
|
+
├── cli.py # Click-based entry points
|
|
27
|
+
├── config.py # Config file + env var handling
|
|
28
|
+
├── pipeline.py # Orchestrates the full run
|
|
29
|
+
├── ingester/
|
|
30
|
+
│ ├── imscc.py # Unzip + walk IMSCC/common cartridge structure
|
|
31
|
+
│ └── transcript.py # VTT folder ingestion
|
|
32
|
+
├── extractors/
|
|
33
|
+
│ ├── docx.py
|
|
34
|
+
│ ├── pdf.py
|
|
35
|
+
│ ├── pptx.py
|
|
36
|
+
│ ├── vtt.py # VTT → clean text (timestamps stripped)
|
|
37
|
+
│ ├── qti.py # Blackboard/Canvas quiz XML → questions only
|
|
38
|
+
│ ├── url_tracker.py # Log video/external URLs found in manifests
|
|
39
|
+
│ └── unit_outline.py # Bespoke plugin (to be provided at build time)
|
|
40
|
+
├── classifier/
|
|
41
|
+
│ └── auto.py # Heuristic + optional AI classification
|
|
42
|
+
├── anythingllm/
|
|
43
|
+
│ └── client.py # AnythingLLM REST API wrapper
|
|
44
|
+
└── webui/
|
|
45
|
+
└── app.py # FastAPI + Jinja2 simple UI
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## CLI Commands
|
|
51
|
+
|
|
52
|
+
### Full pipeline (default usage)
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
vectorboard run course.zip transcripts/ \
|
|
56
|
+
--workspace ISYS2001 \
|
|
57
|
+
--url https://your-anythingllm.instance \
|
|
58
|
+
--key YOUR_API_KEY
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Two-step workflow (for manual tag review)
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Step 1: extract and auto-classify, write tags file
|
|
65
|
+
vectorboard extract course.zip transcripts/ --output ./staging/
|
|
66
|
+
|
|
67
|
+
# Step 2 (optional): open staging/tags.json in any editor and adjust types/titles
|
|
68
|
+
|
|
69
|
+
# Step 3: embed using the (possibly edited) tags file
|
|
70
|
+
vectorboard embed ./staging/ --workspace ISYS2001 --tags staging/tags.json
|
|
71
|
+
|
|
72
|
+
# Step 4: retrieve embed code
|
|
73
|
+
vectorboard chatbot ISYS2001
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Reset a workspace (new semester)
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
vectorboard embed ./staging/ --workspace ISYS2001 --reset
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
`--reset` clears all documents from the workspace before re-uploading. The workspace itself (and its embed code URL) is preserved, so the LMS page never needs updating mid-year.
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Workspace Strategy
|
|
87
|
+
|
|
88
|
+
- **One workspace per unit code** (e.g. `ISYS2001`, `ISYS6020`)
|
|
89
|
+
- Semester changeover is handled by running with `--reset` — workspace identity and embed URL remain stable
|
|
90
|
+
- Archiving, deletion, and multi-semester history management are intentionally out of scope; these are handled manually in the AnythingLLM UI if needed
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Document Classification
|
|
95
|
+
|
|
96
|
+
Auto-classification uses a two-pass approach:
|
|
97
|
+
|
|
98
|
+
**Pass 1 — Filename and path heuristics**
|
|
99
|
+
|
|
100
|
+
Keywords are matched against the filename and the parent folder name (case-insensitive):
|
|
101
|
+
|
|
102
|
+
| Keywords | Assigned type |
|
|
103
|
+
|---|---|
|
|
104
|
+
| `lecture`, `slides`, `week` | `lecture` |
|
|
105
|
+
| `worksheet`, `tutorial`, `lab` | `worksheet` |
|
|
106
|
+
| `assignment`, `task`, `project` | `assignment` |
|
|
107
|
+
| `rubric`, `marking`, `criteria` | `rubric` |
|
|
108
|
+
| `outline`, `unit guide`, `course guide` | `unit_outline` |
|
|
109
|
+
| `quiz`, `test` | `quiz` |
|
|
110
|
+
| `reading`, `article`, `chapter` | `reading` |
|
|
111
|
+
| *(VTT file)* | `transcript` |
|
|
112
|
+
| *(URL-only entry)* | `video_url` |
|
|
113
|
+
|
|
114
|
+
**Pass 2 — Content heuristics**
|
|
115
|
+
|
|
116
|
+
The first 500 characters of each file's extracted text are inspected for structural signals (e.g. a QTI XML namespace → always `quiz`; a "Learning Outcomes" header → likely `unit_outline`). Pass 2 overrides Pass 1 only on high-confidence matches; otherwise the Pass 1 result stands.
|
|
117
|
+
|
|
118
|
+
**Fallback:** `misc`
|
|
119
|
+
|
|
120
|
+
### Tags file format (`tags.json`)
|
|
121
|
+
|
|
122
|
+
Generated automatically after extraction. Edit before the embed step to override any classification.
|
|
123
|
+
|
|
124
|
+
```json
|
|
125
|
+
[
|
|
126
|
+
{
|
|
127
|
+
"source_file": "Week1_Intro.pptx",
|
|
128
|
+
"extracted_as": "staging/Week1_Intro.txt",
|
|
129
|
+
"type": "lecture",
|
|
130
|
+
"title": "Week 1: Introduction to Information Systems",
|
|
131
|
+
"week": 1
|
|
132
|
+
},
|
|
133
|
+
{
|
|
134
|
+
"source_file": "Assessment1_Rubric.docx",
|
|
135
|
+
"extracted_as": "staging/Assessment1_Rubric.txt",
|
|
136
|
+
"type": "rubric",
|
|
137
|
+
"title": "Assessment 1 Rubric",
|
|
138
|
+
"week": null
|
|
139
|
+
}
|
|
140
|
+
]
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Valid `type` values: `lecture`, `worksheet`, `assignment`, `rubric`, `unit_outline`, `quiz`, `reading`, `transcript`, `video_url`, `misc`
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Extractor Behaviour
|
|
148
|
+
|
|
149
|
+
| Source format | Output | Notes |
|
|
150
|
+
|---|---|---|
|
|
151
|
+
| PPTX | Plain text per slide, slide number preserved as a heading | Speaker notes included |
|
|
152
|
+
| DOCX | Plain text, headings preserved as markdown-style markers | Unit outline uses bespoke plugin |
|
|
153
|
+
| PDF | Plain text via `pdfminer.six` or `pymupdf` | `--ocr` flag available as fallback |
|
|
154
|
+
| VTT | Clean plain text, timestamps stripped, filename used as title | One text file per transcript |
|
|
155
|
+
| QTI XML (quizzes) | Question text only, one question per line | Answer choices and correct answers excluded (student-facing) |
|
|
156
|
+
| Video / external URLs | Written to `urls_log.txt` with the page/item context they appeared in | Not fetched or scraped |
|
|
157
|
+
|
|
158
|
+
### Unit outline extraction
|
|
159
|
+
|
|
160
|
+
A bespoke Python plugin handles unit outlines. Interface expected at build time:
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
def extract_unit_outline(file_path: str) -> str:
|
|
164
|
+
"""Takes an absolute path to a DOCX/PDF unit outline.
|
|
165
|
+
Returns extracted plain text as a single string."""
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## AnythingLLM Integration
|
|
171
|
+
|
|
172
|
+
Uses the AnythingLLM REST API. Key operations:
|
|
173
|
+
|
|
174
|
+
| Operation | API call |
|
|
175
|
+
|---|---|
|
|
176
|
+
| Check workspace exists | `GET /api/v1/workspaces` |
|
|
177
|
+
| Create workspace | `POST /api/v1/workspace/new` |
|
|
178
|
+
| Upload document | `POST /api/v1/document/upload` |
|
|
179
|
+
| Move document to workspace | `POST /api/v1/workspace/{slug}/update-embeddings` |
|
|
180
|
+
| Reset workspace documents | `DELETE` or re-embed with `--reset` flag |
|
|
181
|
+
| Get chatbot embed code | `GET /api/v1/workspace/{slug}/chatbot-embed` *(confirm endpoint)* |
|
|
182
|
+
|
|
183
|
+
### Configuration
|
|
184
|
+
|
|
185
|
+
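Credentials should not normally be passed as plain CLI arguments, because they end up in shell history and process listings; the `--url` and `--key` options shown in the full-pipeline example exist only as an explicit override. Prefer setting them via: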
|
|
186
|
+
|
|
187
|
+
- A `.vectorboard.env` file in the working directory, or
|
|
188
|
+
- Environment variables: `ANYTHINGLLM_URL`, `ANYTHINGLLM_KEY`
|
|
189
|
+
|
|
190
|
+
A `vectorboard init` command scaffolds the `.vectorboard.env` file interactively.
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Staging and Cleanup
|
|
195
|
+
|
|
196
|
+
- Extracted text files are written to a `./staging/` directory during processing
|
|
197
|
+
- After a successful embed, staging files are deleted automatically unless `--keep-staging` is passed
|
|
198
|
+
- The `tags.json` file is always retained after a run (useful for auditing what was embedded)
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## WebUI
|
|
203
|
+
|
|
204
|
+
A lightweight FastAPI + Jinja2 interface for lecturers who prefer not to use the terminal.
|
|
205
|
+
|
|
206
|
+
**Workflow:**
|
|
207
|
+
|
|
208
|
+
1. Upload course ZIP and transcript folder
|
|
209
|
+
2. Review extracted items with auto-assigned types (inline dropdowns to override before embedding)
|
|
210
|
+
3. Click **Embed to AnythingLLM**
|
|
211
|
+
4. Copy the chatbot embed code from the result panel
|
|
212
|
+
|
|
213
|
+
The WebUI is a thin wrapper around the same pipeline — no separate logic. Launch with:
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
vectorboard serve
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## Out of Scope (v1)
|
|
222
|
+
|
|
223
|
+
- Fetching or scraping external URLs
|
|
224
|
+
- Semester archiving or workspace deletion
|
|
225
|
+
- Multi-workspace merge
|
|
226
|
+
- Direct LMS API integration (embed code is copied manually)
|
|
227
|
+
- LMS platforms other than Blackboard/Canvas IMSCC format
|
|
228
|
+
- Student authentication or access control on the WebUI
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Open Questions (resolve at build time)
|
|
233
|
+
|
|
234
|
+
1. **Unit outline plugin** — confirm function signature and whether it handles both DOCX and PDF inputs
|
|
235
|
+
2. **AnythingLLM embed code** — confirm whether the embed snippet is retrievable via API or only from the UI; if UI-only, the `chatbot` command prints the workspace URL and instructs the user to copy from there
|
|
236
|
+
3. **AnythingLLM document metadata** — determine which metadata fields (week number, doc type) can be passed at upload time to improve citation quality in responses
|
|
237
|
+
4. **Canvas differences** — confirm whether Canvas IMSCC exports use the same QTI format and manifest structure as Blackboard; adjust `imscc.py` accordingly
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## Suggested Dependency Stack
|
|
242
|
+
|
|
243
|
+
| Purpose | Library |
|
|
244
|
+
|---|---|
|
|
245
|
+
| CLI framework | `click` |
|
|
246
|
+
| WebUI | `fastapi` + `jinja2` + `uvicorn` |
|
|
247
|
+
| PPTX extraction | `python-pptx` |
|
|
248
|
+
| DOCX extraction | `python-docx` |
|
|
249
|
+
| PDF extraction | `pdfminer.six` or `pymupdf` |
|
|
250
|
+
| VTT parsing | `webvtt-py` |
|
|
251
|
+
| HTTP client (AnythingLLM API) | `httpx` |
|
|
252
|
+
| Config / env | `python-dotenv` |
|
|
253
|
+
| Packaging | `pyproject.toml` with `pip install -e .` |
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
*VectorBoard v0.1 spec — generated April 2026. Continue build in a Claude Code session.*
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "botstash"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "CLI tool and WebUI that ingests LMS course exports and Echo360 transcripts into an AnythingLLM workspace for embedded course chatbots."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.11"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Michael", email = "michael@example.com" },
|
|
10
|
+
]
|
|
11
|
+
keywords = ["lms", "chatbot", "anythingllm", "botstash", "education", "rag"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Intended Audience :: Education",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"Topic :: Education",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"click>=8.1",
|
|
24
|
+
"httpx>=0.27",
|
|
25
|
+
"python-dotenv>=1.0",
|
|
26
|
+
"python-pptx>=1.0",
|
|
27
|
+
"python-docx>=1.1",
|
|
28
|
+
"pdfminer.six>=20231228",
|
|
29
|
+
"webvtt-py>=0.5",
|
|
30
|
+
"fastapi>=0.115",
|
|
31
|
+
"jinja2>=3.1",
|
|
32
|
+
"uvicorn>=0.32",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
botstash = "botstash.cli:cli"
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/michael-borck/botstash"
|
|
40
|
+
Repository = "https://github.com/michael-borck/botstash"
|
|
41
|
+
Issues = "https://github.com/michael-borck/botstash/issues"
|
|
42
|
+
|
|
43
|
+
[build-system]
|
|
44
|
+
requires = ["hatchling"]
|
|
45
|
+
build-backend = "hatchling.build"
|
|
46
|
+
|
|
47
|
+
[tool.hatch.build.targets.wheel]
|
|
48
|
+
packages = ["src/botstash"]
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
target-version = "py311"
|
|
52
|
+
line-length = 88
|
|
53
|
+
src = ["src", "tests"]
|
|
54
|
+
|
|
55
|
+
[tool.ruff.lint]
|
|
56
|
+
select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TCH"]
|
|
57
|
+
|
|
58
|
+
[tool.mypy]
|
|
59
|
+
python_version = "3.11"
|
|
60
|
+
strict = true
|
|
61
|
+
warn_return_any = true
|
|
62
|
+
warn_unused_configs = true
|
|
63
|
+
|
|
64
|
+
[tool.pytest.ini_options]
|
|
65
|
+
testpaths = ["tests"]
|
|
66
|
+
addopts = "-v --tb=short"
|
|
67
|
+
|
|
68
|
+
[dependency-groups]
|
|
69
|
+
dev = [
|
|
70
|
+
"mypy>=1.20.0",
|
|
71
|
+
"pytest>=9.0.3",
|
|
72
|
+
"ruff>=0.15.9",
|
|
73
|
+
"twine>=6.2.0",
|
|
74
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""AnythingLLM REST API wrapper."""
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Heuristic and optional AI classification."""
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Click-based CLI entry points."""
|
|
2
|
+
|
|
3
|
+
import click
|
|
4
|
+
|
|
5
|
+
from botstash import __version__
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@click.group()
|
|
9
|
+
@click.version_option(version=__version__, prog_name="botstash")
|
|
10
|
+
def cli() -> None:
|
|
11
|
+
"""BotStash — LMS course content to AnythingLLM chatbot pipeline."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@cli.command()
|
|
15
|
+
@click.argument("course_zip", type=click.Path(exists=True))
|
|
16
|
+
@click.argument("transcripts", type=click.Path(exists=True))
|
|
17
|
+
@click.option("--workspace", required=True, help="AnythingLLM workspace name.")
|
|
18
|
+
@click.option("--url", envvar="ANYTHINGLLM_URL", help="AnythingLLM instance URL.")
|
|
19
|
+
@click.option("--key", envvar="ANYTHINGLLM_KEY", help="AnythingLLM API key.")
|
|
20
|
+
@click.option("--keep-staging", is_flag=True, help="Keep staging files after embed.")
|
|
21
|
+
def run(
|
|
22
|
+
course_zip: str,
|
|
23
|
+
transcripts: str,
|
|
24
|
+
workspace: str,
|
|
25
|
+
url: str | None,
|
|
26
|
+
key: str | None,
|
|
27
|
+
keep_staging: bool,
|
|
28
|
+
) -> None:
|
|
29
|
+
"""Run the full extract-classify-embed pipeline."""
|
|
30
|
+
click.echo("botstash run: not yet implemented")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@cli.command()
|
|
34
|
+
@click.argument("course_zip", type=click.Path(exists=True))
|
|
35
|
+
@click.argument("transcripts", type=click.Path(exists=True))
|
|
36
|
+
@click.option("--output", default="./staging", help="Output directory.")
|
|
37
|
+
def extract(course_zip: str, transcripts: str, output: str) -> None:
|
|
38
|
+
"""Extract and auto-classify content from a course export."""
|
|
39
|
+
click.echo("botstash extract: not yet implemented")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@cli.command()
|
|
43
|
+
@click.argument("staging_dir", type=click.Path(exists=True))
|
|
44
|
+
@click.option("--workspace", required=True, help="AnythingLLM workspace name.")
|
|
45
|
+
@click.option("--tags", type=click.Path(exists=True), help="Path to tags.json.")
|
|
46
|
+
@click.option("--reset", is_flag=True, help="Clear workspace before uploading.")
|
|
47
|
+
def embed(staging_dir: str, workspace: str, tags: str | None, reset: bool) -> None:
|
|
48
|
+
"""Embed staged documents into an AnythingLLM workspace."""
|
|
49
|
+
click.echo("botstash embed: not yet implemented")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@cli.command()
|
|
53
|
+
@click.argument("workspace")
|
|
54
|
+
def chatbot(workspace: str) -> None:
|
|
55
|
+
"""Retrieve the chatbot embed code for a workspace."""
|
|
56
|
+
click.echo("botstash chatbot: not yet implemented")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@cli.command()
|
|
60
|
+
def init() -> None:
|
|
61
|
+
"""Scaffold a .botstash.env configuration file."""
|
|
62
|
+
click.echo("botstash init: not yet implemented")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@cli.command()
|
|
66
|
+
@click.option("--host", default="127.0.0.1", help="Host to bind to.")
|
|
67
|
+
@click.option("--port", default=8000, help="Port to bind to.")
|
|
68
|
+
def serve(host: str, port: int) -> None:
|
|
69
|
+
"""Launch the BotStash WebUI."""
|
|
70
|
+
click.echo("botstash serve: not yet implemented")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Configuration file and environment variable handling."""
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""DOCX text extraction."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""PDF text extraction."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""PPTX text extraction."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""QTI XML quiz extraction — question text only."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Bespoke unit outline extraction plugin."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Log video/external URLs found in manifests."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""VTT to clean text conversion (timestamps stripped)."""
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Unzip and walk IMSCC/common cartridge structure."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""VTT folder ingestion."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Orchestrates the full extract-classify-embed pipeline."""
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""FastAPI + Jinja2 WebUI."""
|
|
File without changes
|