chunksilo 2.0.0__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chunksilo might be problematic. Click here for more details.
- {chunksilo-2.0.0/src/chunksilo.egg-info → chunksilo-2.1.0}/PKG-INFO +65 -16
- {chunksilo-2.0.0 → chunksilo-2.1.0}/README.md +62 -15
- {chunksilo-2.0.0 → chunksilo-2.1.0}/pyproject.toml +2 -1
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/__init__.py +1 -1
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/cfgload.py +10 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/search.py +478 -10
- {chunksilo-2.0.0 → chunksilo-2.1.0/src/chunksilo.egg-info}/PKG-INFO +65 -16
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/SOURCES.txt +1 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/requires.txt +3 -0
- chunksilo-2.1.0/test/test_jira_integration.py +565 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_utils.py +94 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/LICENSE +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/NOTICE +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/requirements.txt +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/setup.cfg +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/__main__.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/cli.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/confluence_html_formatter.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/index.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/server.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/dependency_links.txt +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/entry_points.txt +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/top_level.txt +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_chunk_location.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_confluence_html_formatter.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_error_handling.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_heading_path_integration.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_incremental_ingest.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_rag_metrics.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_retrieval_only.py +0 -0
- {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_system.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chunksilo
|
|
3
|
-
Version: 2.0.0
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: Local RAG-based semantic document search with MCP server interface
|
|
5
5
|
Author: Fredrik Reveny
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -33,32 +33,53 @@ Requires-Dist: fastembed<1,>=0.5.0
|
|
|
33
33
|
Requires-Dist: pyyaml<7,>=6.0
|
|
34
34
|
Provides-Extra: confluence
|
|
35
35
|
Requires-Dist: llama-index-readers-confluence<1,>=0.6.0; extra == "confluence"
|
|
36
|
+
Provides-Extra: jira
|
|
37
|
+
Requires-Dist: jira<4,>=3.5.0; extra == "jira"
|
|
36
38
|
Provides-Extra: test
|
|
37
39
|
Requires-Dist: pytest<9,>=7.4.0; extra == "test"
|
|
38
40
|
Requires-Dist: requests<3,>=2.31.0; extra == "test"
|
|
39
41
|
Dynamic: license-file
|
|
40
42
|
|
|
43
|
+
<p align="center">
|
|
44
|
+
<img src="chunksilo.png" alt="ChunkSilo Logo" width="500">
|
|
45
|
+
</p>
|
|
46
|
+
|
|
41
47
|
# ChunkSilo MCP Server
|
|
42
48
|
|
|
43
49
|
ChunkSilo is like a local Google for your documents. It uses semantic search — matching by meaning rather than exact keywords — so your LLM can find relevant information across all your files even when the wording differs from your query. Point it at your PDFs, Word docs, Markdown, and text files, and it builds a fully searchable index locally on your machine.
|
|
44
50
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
-
|
|
48
|
-
-
|
|
49
|
-
-
|
|
50
|
-
-
|
|
51
|
+
- Runs entirely on your machine — no servers, no infrastructure
|
|
52
|
+
- Semantic search + keyword filename matching across PDF, DOCX, DOC, Markdown, and TXT
|
|
53
|
+
- Incremental indexing — only reprocesses new or changed files
|
|
54
|
+
- Heading-aware results with source links back to the original file
|
|
55
|
+
- Date filtering and recency boosting
|
|
56
|
+
- Optional Confluence integration
|
|
51
57
|
|
|
52
|
-
|
|
58
|
+
### Example `search_docs` output
|
|
53
59
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
60
|
+
```json
|
|
61
|
+
{
|
|
62
|
+
"matched_files": [
|
|
63
|
+
{ "uri": "file:///docs/database-configuration.docx", "score": 0.8432 }
|
|
64
|
+
],
|
|
65
|
+
"num_matched_files": 1,
|
|
66
|
+
"chunks": [
|
|
67
|
+
{
|
|
68
|
+
"text": "To configure the database connection, set the DATABASE_URL environment variable...",
|
|
69
|
+
"score": 0.912,
|
|
70
|
+
"location": {
|
|
71
|
+
"uri": "file:///docs/setup-guide.pdf",
|
|
72
|
+
"page": 12,
|
|
73
|
+
"line": null,
|
|
74
|
+
"heading_path": ["Getting Started", "Configuration", "Database"]
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
],
|
|
78
|
+
"num_chunks": 1,
|
|
79
|
+
"query": "how to configure the database",
|
|
80
|
+
"retrieval_time": "0.42s"
|
|
81
|
+
}
|
|
82
|
+
```
|
|
62
83
|
|
|
63
84
|
## Installation
|
|
64
85
|
|
|
@@ -71,6 +92,12 @@ pip install chunksilo
|
|
|
71
92
|
|
|
72
93
|
# Or with Confluence support:
|
|
73
94
|
pip install chunksilo[confluence]
|
|
95
|
+
|
|
96
|
+
# Or with Jira support:
|
|
97
|
+
pip install chunksilo[jira]
|
|
98
|
+
|
|
99
|
+
# Or with both Confluence and Jira:
|
|
100
|
+
pip install chunksilo[confluence,jira]
|
|
74
101
|
```
|
|
75
102
|
|
|
76
103
|
Then:
|
|
@@ -184,6 +211,27 @@ All settings are optional and have sensible defaults.
|
|
|
184
211
|
| `confluence.timeout` | `10.0` | Request timeout in seconds |
|
|
185
212
|
| `confluence.max_results` | `30` | Maximum results per search |
|
|
186
213
|
|
|
214
|
+
#### Jira Settings (optional)
|
|
215
|
+
|
|
216
|
+
> **Note:** Jira integration requires the optional dependency. Install with: `pip install chunksilo[jira]`
|
|
217
|
+
|
|
218
|
+
| Setting | Default | Description |
|
|
219
|
+
| :--- | :--- | :--- |
|
|
220
|
+
| `jira.url` | `""` | Jira base URL (empty = disabled) |
|
|
221
|
+
| `jira.username` | `""` | Jira username/email |
|
|
222
|
+
| `jira.api_token` | `""` | Jira API token |
|
|
223
|
+
| `jira.timeout` | `10.0` | Request timeout in seconds |
|
|
224
|
+
| `jira.max_results` | `30` | Maximum results per search |
|
|
225
|
+
| `jira.projects` | `[]` | Project keys to search (empty = all) |
|
|
226
|
+
| `jira.include_comments` | `true` | Include issue comments in search |
|
|
227
|
+
| `jira.include_custom_fields` | `true` | Include custom fields in search |
|
|
228
|
+
|
|
229
|
+
**Creating a Jira API Token:**
|
|
230
|
+
1. Log into Jira
|
|
231
|
+
2. Go to Account Settings > Security > API Tokens
|
|
232
|
+
3. Click "Create API Token"
|
|
233
|
+
4. Copy the token and add it to your config
|
|
234
|
+
|
|
187
235
|
#### SSL Settings (optional)
|
|
188
236
|
|
|
189
237
|
| Setting | Default | Description |
|
|
@@ -357,6 +405,7 @@ Add to `mcp_settings.json` (typically in `~/.config/Code/User/globalStorage/roov
|
|
|
357
405
|
- **Retrieval errors**: Check paths in your MCP client configuration.
|
|
358
406
|
- **Offline mode**: PyPI installs default to `offline: false` (models auto-download). The offline bundle includes pre-downloaded models and sets `offline: true`. Set `retrieval.offline: true` in `config.yaml` to prevent network calls after initial model download.
|
|
359
407
|
- **Confluence Integration**: Install with `pip install chunksilo[confluence]`, then set `confluence.url`, `confluence.username`, and `confluence.api_token` in `config.yaml`.
|
|
408
|
+
- **Jira Integration**: Install with `pip install chunksilo[jira]`, then set `jira.url`, `jira.username`, and `jira.api_token` in `config.yaml`. Optionally configure `jira.projects` to restrict search to specific project keys.
|
|
360
409
|
- **Custom CA Bundle**: Set `ssl.ca_bundle_path` in `config.yaml` for custom certificates.
|
|
361
410
|
- **Network mounts**: Unavailable directories are skipped with a warning; indexing continues with available directories.
|
|
362
411
|
- **Legacy .doc files**: Requires LibreOffice to be installed for automatic conversion to .docx. If LibreOffice is not found, .doc files are skipped with a warning. Full heading extraction is supported.
|
|
@@ -1,24 +1,43 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="chunksilo.png" alt="ChunkSilo Logo" width="500">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
1
5
|
# ChunkSilo MCP Server
|
|
2
6
|
|
|
3
7
|
ChunkSilo is like a local Google for your documents. It uses semantic search — matching by meaning rather than exact keywords — so your LLM can find relevant information across all your files even when the wording differs from your query. Point it at your PDFs, Word docs, Markdown, and text files, and it builds a fully searchable index locally on your machine.
|
|
4
8
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
-
|
|
8
|
-
-
|
|
9
|
-
-
|
|
10
|
-
-
|
|
9
|
+
- Runs entirely on your machine — no servers, no infrastructure
|
|
10
|
+
- Semantic search + keyword filename matching across PDF, DOCX, DOC, Markdown, and TXT
|
|
11
|
+
- Incremental indexing — only reprocesses new or changed files
|
|
12
|
+
- Heading-aware results with source links back to the original file
|
|
13
|
+
- Date filtering and recency boosting
|
|
14
|
+
- Optional Confluence integration
|
|
11
15
|
|
|
12
|
-
|
|
16
|
+
### Example `search_docs` output
|
|
13
17
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
```json
|
|
19
|
+
{
|
|
20
|
+
"matched_files": [
|
|
21
|
+
{ "uri": "file:///docs/database-configuration.docx", "score": 0.8432 }
|
|
22
|
+
],
|
|
23
|
+
"num_matched_files": 1,
|
|
24
|
+
"chunks": [
|
|
25
|
+
{
|
|
26
|
+
"text": "To configure the database connection, set the DATABASE_URL environment variable...",
|
|
27
|
+
"score": 0.912,
|
|
28
|
+
"location": {
|
|
29
|
+
"uri": "file:///docs/setup-guide.pdf",
|
|
30
|
+
"page": 12,
|
|
31
|
+
"line": null,
|
|
32
|
+
"heading_path": ["Getting Started", "Configuration", "Database"]
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"num_chunks": 1,
|
|
37
|
+
"query": "how to configure the database",
|
|
38
|
+
"retrieval_time": "0.42s"
|
|
39
|
+
}
|
|
40
|
+
```
|
|
22
41
|
|
|
23
42
|
## Installation
|
|
24
43
|
|
|
@@ -31,6 +50,12 @@ pip install chunksilo
|
|
|
31
50
|
|
|
32
51
|
# Or with Confluence support:
|
|
33
52
|
pip install chunksilo[confluence]
|
|
53
|
+
|
|
54
|
+
# Or with Jira support:
|
|
55
|
+
pip install chunksilo[jira]
|
|
56
|
+
|
|
57
|
+
# Or with both Confluence and Jira:
|
|
58
|
+
pip install chunksilo[confluence,jira]
|
|
34
59
|
```
|
|
35
60
|
|
|
36
61
|
Then:
|
|
@@ -144,6 +169,27 @@ All settings are optional and have sensible defaults.
|
|
|
144
169
|
| `confluence.timeout` | `10.0` | Request timeout in seconds |
|
|
145
170
|
| `confluence.max_results` | `30` | Maximum results per search |
|
|
146
171
|
|
|
172
|
+
#### Jira Settings (optional)
|
|
173
|
+
|
|
174
|
+
> **Note:** Jira integration requires the optional dependency. Install with: `pip install chunksilo[jira]`
|
|
175
|
+
|
|
176
|
+
| Setting | Default | Description |
|
|
177
|
+
| :--- | :--- | :--- |
|
|
178
|
+
| `jira.url` | `""` | Jira base URL (empty = disabled) |
|
|
179
|
+
| `jira.username` | `""` | Jira username/email |
|
|
180
|
+
| `jira.api_token` | `""` | Jira API token |
|
|
181
|
+
| `jira.timeout` | `10.0` | Request timeout in seconds |
|
|
182
|
+
| `jira.max_results` | `30` | Maximum results per search |
|
|
183
|
+
| `jira.projects` | `[]` | Project keys to search (empty = all) |
|
|
184
|
+
| `jira.include_comments` | `true` | Include issue comments in search |
|
|
185
|
+
| `jira.include_custom_fields` | `true` | Include custom fields in search |
|
|
186
|
+
|
|
187
|
+
**Creating a Jira API Token:**
|
|
188
|
+
1. Log into Jira
|
|
189
|
+
2. Go to Account Settings > Security > API Tokens
|
|
190
|
+
3. Click "Create API Token"
|
|
191
|
+
4. Copy the token and add it to your config
|
|
192
|
+
|
|
147
193
|
#### SSL Settings (optional)
|
|
148
194
|
|
|
149
195
|
| Setting | Default | Description |
|
|
@@ -317,6 +363,7 @@ Add to `mcp_settings.json` (typically in `~/.config/Code/User/globalStorage/roov
|
|
|
317
363
|
- **Retrieval errors**: Check paths in your MCP client configuration.
|
|
318
364
|
- **Offline mode**: PyPI installs default to `offline: false` (models auto-download). The offline bundle includes pre-downloaded models and sets `offline: true`. Set `retrieval.offline: true` in `config.yaml` to prevent network calls after initial model download.
|
|
319
365
|
- **Confluence Integration**: Install with `pip install chunksilo[confluence]`, then set `confluence.url`, `confluence.username`, and `confluence.api_token` in `config.yaml`.
|
|
366
|
+
- **Jira Integration**: Install with `pip install chunksilo[jira]`, then set `jira.url`, `jira.username`, and `jira.api_token` in `config.yaml`. Optionally configure `jira.projects` to restrict search to specific project keys.
|
|
320
367
|
- **Custom CA Bundle**: Set `ssl.ca_bundle_path` in `config.yaml` for custom certificates.
|
|
321
368
|
- **Network mounts**: Unavailable directories are skipped with a warning; indexing continues with available directories.
|
|
322
369
|
- **Legacy .doc files**: Requires LibreOffice to be installed for automatic conversion to .docx. If LibreOffice is not found, .doc files are skipped with a warning. Full heading extraction is supported.
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "chunksilo"
|
|
7
|
-
version = "2.0.0"
|
|
7
|
+
version = "2.1.0"
|
|
8
8
|
description = "Local RAG-based semantic document search with MCP server interface"
|
|
9
9
|
license = "Apache-2.0"
|
|
10
10
|
requires-python = ">=3.11"
|
|
@@ -26,6 +26,7 @@ dynamic = ["dependencies"]
|
|
|
26
26
|
|
|
27
27
|
[project.optional-dependencies]
|
|
28
28
|
confluence = ["llama-index-readers-confluence>=0.6.0,<1"]
|
|
29
|
+
jira = ["jira>=3.5.0,<4"]
|
|
29
30
|
test = ["pytest>=7.4.0,<9", "requests>=2.31.0,<3"]
|
|
30
31
|
|
|
31
32
|
[project.scripts]
|
|
@@ -70,6 +70,16 @@ _DEFAULTS: dict[str, Any] = {
|
|
|
70
70
|
"timeout": 10.0,
|
|
71
71
|
"max_results": 30,
|
|
72
72
|
},
|
|
73
|
+
"jira": {
|
|
74
|
+
"url": "",
|
|
75
|
+
"username": "",
|
|
76
|
+
"api_token": "",
|
|
77
|
+
"timeout": 10.0,
|
|
78
|
+
"max_results": 30,
|
|
79
|
+
"projects": [], # Empty list = all accessible projects
|
|
80
|
+
"include_comments": True,
|
|
81
|
+
"include_custom_fields": True,
|
|
82
|
+
},
|
|
73
83
|
"ssl": {
|
|
74
84
|
"ca_bundle_path": "",
|
|
75
85
|
},
|