chunksilo 2.0.0__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chunksilo might be problematic. Click here for more details.

Files changed (31) hide show
  1. {chunksilo-2.0.0/src/chunksilo.egg-info → chunksilo-2.1.0}/PKG-INFO +65 -16
  2. {chunksilo-2.0.0 → chunksilo-2.1.0}/README.md +62 -15
  3. {chunksilo-2.0.0 → chunksilo-2.1.0}/pyproject.toml +2 -1
  4. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/__init__.py +1 -1
  5. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/cfgload.py +10 -0
  6. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/search.py +478 -10
  7. {chunksilo-2.0.0 → chunksilo-2.1.0/src/chunksilo.egg-info}/PKG-INFO +65 -16
  8. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/SOURCES.txt +1 -0
  9. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/requires.txt +3 -0
  10. chunksilo-2.1.0/test/test_jira_integration.py +565 -0
  11. {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_utils.py +94 -0
  12. {chunksilo-2.0.0 → chunksilo-2.1.0}/LICENSE +0 -0
  13. {chunksilo-2.0.0 → chunksilo-2.1.0}/NOTICE +0 -0
  14. {chunksilo-2.0.0 → chunksilo-2.1.0}/requirements.txt +0 -0
  15. {chunksilo-2.0.0 → chunksilo-2.1.0}/setup.cfg +0 -0
  16. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/__main__.py +0 -0
  17. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/cli.py +0 -0
  18. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/confluence_html_formatter.py +0 -0
  19. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/index.py +0 -0
  20. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo/server.py +0 -0
  21. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/dependency_links.txt +0 -0
  22. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/entry_points.txt +0 -0
  23. {chunksilo-2.0.0 → chunksilo-2.1.0}/src/chunksilo.egg-info/top_level.txt +0 -0
  24. {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_chunk_location.py +0 -0
  25. {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_confluence_html_formatter.py +0 -0
  26. {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_error_handling.py +0 -0
  27. {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_heading_path_integration.py +0 -0
  28. {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_incremental_ingest.py +0 -0
  29. {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_rag_metrics.py +0 -0
  30. {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_retrieval_only.py +0 -0
  31. {chunksilo-2.0.0 → chunksilo-2.1.0}/test/test_system.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunksilo
3
- Version: 2.0.0
3
+ Version: 2.1.0
4
4
  Summary: Local RAG-based semantic document search with MCP server interface
5
5
  Author: Fredrik Reveny
6
6
  License-Expression: Apache-2.0
@@ -33,32 +33,53 @@ Requires-Dist: fastembed<1,>=0.5.0
33
33
  Requires-Dist: pyyaml<7,>=6.0
34
34
  Provides-Extra: confluence
35
35
  Requires-Dist: llama-index-readers-confluence<1,>=0.6.0; extra == "confluence"
36
+ Provides-Extra: jira
37
+ Requires-Dist: jira<4,>=3.5.0; extra == "jira"
36
38
  Provides-Extra: test
37
39
  Requires-Dist: pytest<9,>=7.4.0; extra == "test"
38
40
  Requires-Dist: requests<3,>=2.31.0; extra == "test"
39
41
  Dynamic: license-file
40
42
 
43
+ <p align="center">
44
+ <img src="chunksilo.png" alt="ChunkSilo Logo" width="500">
45
+ </p>
46
+
41
47
  # ChunkSilo MCP Server
42
48
 
43
49
  ChunkSilo is like a local Google for your documents. It uses semantic search — matching by meaning rather than exact keywords — so your LLM can find relevant information across all your files even when the wording differs from your query. Point it at your PDFs, Word docs, Markdown, and text files, and it builds a fully searchable index locally on your machine.
44
50
 
45
- ## Overview
46
-
47
- - **No permissions headache**: Each user indexes only the files they already have access to. No centralized access-control system to build or maintain — document permissions stay exactly where they are.
48
- - **No infrastructure required**: Runs entirely on the user's own machine as an MCP server. Nothing to deploy, no servers to manage.
49
- - **Easy to set up**: Any user with an MCP-compatible LLM client can install, point at their document directories, and have everything indexed and searchable.
50
- - **Works with what you have**: Supports PDF, DOCX, DOC, Markdown, and TXT from local folders, network drives, or shared mounts.
51
+ - Runs entirely on your machine — no servers, no infrastructure
52
+ - Semantic search + keyword filename matching across PDF, DOCX, DOC, Markdown, and TXT
53
+ - Incremental indexing only reprocesses new or changed files
54
+ - Heading-aware results with source links back to the original file
55
+ - Date filtering and recency boosting
56
+ - Optional Confluence and Jira integrations
51
57
 
52
- ## Features
58
+ ### Example `search_docs` output
53
59
 
54
- - **Local indexing and search**: All indexing and search runs on your machine with bundled models — ChunkSilo itself makes no external network calls when `offline: true`. Note: search results are passed to your MCP client's LLM, which may be cloud-hosted.
55
- - **Incremental indexing**: Only reindexes new or changed files, so re-runs are fast even on large document collections.
56
- - **Heading-aware navigation**: Extracts headings from PDFs, Word docs, and Markdown so results include the full heading path (e.g. "Chapter 3 > Setup > Prerequisites").
57
- - **Date filtering and recency boost**: Search within a date range or let recent documents rank higher automatically.
58
- - **Dual retrieval**: Returns both meaning-based chunk matches and keyword-based filename matches separately, so file lookups don't get buried by unrelated content.
59
- - **Multi-directory with per-folder rules**: Index multiple directories with individual include/exclude glob patterns — useful for shared drives with mixed content.
60
- - **Confluence integration**: Optionally searches your Confluence instance alongside local files, with results returned in the same format.
61
- - **Source links**: Each result includes a clickable link back to the source file or Confluence page in supported MCP clients.
60
+ ```json
61
+ {
62
+ "matched_files": [
63
+ { "uri": "file:///docs/database-configuration.docx", "score": 0.8432 }
64
+ ],
65
+ "num_matched_files": 1,
66
+ "chunks": [
67
+ {
68
+ "text": "To configure the database connection, set the DATABASE_URL environment variable...",
69
+ "score": 0.912,
70
+ "location": {
71
+ "uri": "file:///docs/setup-guide.pdf",
72
+ "page": 12,
73
+ "line": null,
74
+ "heading_path": ["Getting Started", "Configuration", "Database"]
75
+ }
76
+ }
77
+ ],
78
+ "num_chunks": 1,
79
+ "query": "how to configure the database",
80
+ "retrieval_time": "0.42s"
81
+ }
82
+ ```
62
83
 
63
84
  ## Installation
64
85
 
@@ -71,6 +92,12 @@ pip install chunksilo
71
92
 
72
93
  # Or with Confluence support:
73
94
  pip install chunksilo[confluence]
95
+
96
+ # Or with Jira support:
97
+ pip install chunksilo[jira]
98
+
99
+ # Or with both Confluence and Jira:
100
+ pip install chunksilo[confluence,jira]
74
101
  ```
75
102
 
76
103
  Then:
@@ -184,6 +211,27 @@ All settings are optional and have sensible defaults.
184
211
  | `confluence.timeout` | `10.0` | Request timeout in seconds |
185
212
  | `confluence.max_results` | `30` | Maximum results per search |
186
213
 
214
+ #### Jira Settings (optional)
215
+
216
+ > **Note:** Jira integration requires the optional dependency. Install with: `pip install chunksilo[jira]`
217
+
218
+ | Setting | Default | Description |
219
+ | :--- | :--- | :--- |
220
+ | `jira.url` | `""` | Jira base URL (empty = disabled) |
221
+ | `jira.username` | `""` | Jira username/email |
222
+ | `jira.api_token` | `""` | Jira API token |
223
+ | `jira.timeout` | `10.0` | Request timeout in seconds |
224
+ | `jira.max_results` | `30` | Maximum results per search |
225
+ | `jira.projects` | `[]` | Project keys to search (empty = all) |
226
+ | `jira.include_comments` | `false` | Include issue comments in search |
227
+ | `jira.include_custom_fields` | `false` | Include custom fields in search |
228
+
229
+ **Creating a Jira API Token:**
230
+ 1. Log in to Jira
231
+ 2. Go to Account Settings > Security > API Tokens
232
+ 3. Click "Create API Token"
233
+ 4. Copy the token and add it to your config
234
+
187
235
  #### SSL Settings (optional)
188
236
 
189
237
  | Setting | Default | Description |
@@ -357,6 +405,7 @@ Add to `mcp_settings.json` (typically in `~/.config/Code/User/globalStorage/roov
357
405
  - **Retrieval errors**: Check paths in your MCP client configuration.
358
406
  - **Offline mode**: PyPI installs default to `offline: false` (models auto-download). The offline bundle includes pre-downloaded models and sets `offline: true`. Set `retrieval.offline: true` in `config.yaml` to prevent network calls after initial model download.
359
407
  - **Confluence Integration**: Install with `pip install chunksilo[confluence]`, then set `confluence.url`, `confluence.username`, and `confluence.api_token` in `config.yaml`.
408
+ - **Jira Integration**: Install with `pip install chunksilo[jira]`, then set `jira.url`, `jira.username`, and `jira.api_token` in `config.yaml`. Optionally configure `jira.projects` to restrict search to specific project keys.
360
409
  - **Custom CA Bundle**: Set `ssl.ca_bundle_path` in `config.yaml` for custom certificates.
361
410
  - **Network mounts**: Unavailable directories are skipped with a warning; indexing continues with available directories.
362
411
  - **Legacy .doc files**: Requires LibreOffice to be installed for automatic conversion to .docx. If LibreOffice is not found, .doc files are skipped with a warning. Full heading extraction is supported.
@@ -1,24 +1,43 @@
1
+ <p align="center">
2
+ <img src="chunksilo.png" alt="ChunkSilo Logo" width="500">
3
+ </p>
4
+
1
5
  # ChunkSilo MCP Server
2
6
 
3
7
  ChunkSilo is like a local Google for your documents. It uses semantic search — matching by meaning rather than exact keywords — so your LLM can find relevant information across all your files even when the wording differs from your query. Point it at your PDFs, Word docs, Markdown, and text files, and it builds a fully searchable index locally on your machine.
4
8
 
5
- ## Overview
6
-
7
- - **No permissions headache**: Each user indexes only the files they already have access to. No centralized access-control system to build or maintain — document permissions stay exactly where they are.
8
- - **No infrastructure required**: Runs entirely on the user's own machine as an MCP server. Nothing to deploy, no servers to manage.
9
- - **Easy to set up**: Any user with an MCP-compatible LLM client can install, point at their document directories, and have everything indexed and searchable.
10
- - **Works with what you have**: Supports PDF, DOCX, DOC, Markdown, and TXT from local folders, network drives, or shared mounts.
9
+ - Runs entirely on your machine — no servers, no infrastructure
10
+ - Semantic search + keyword filename matching across PDF, DOCX, DOC, Markdown, and TXT
11
+ - Incremental indexing only reprocesses new or changed files
12
+ - Heading-aware results with source links back to the original file
13
+ - Date filtering and recency boosting
14
+ - Optional Confluence and Jira integrations
11
15
 
12
- ## Features
16
+ ### Example `search_docs` output
13
17
 
14
- - **Local indexing and search**: All indexing and search runs on your machine with bundled models — ChunkSilo itself makes no external network calls when `offline: true`. Note: search results are passed to your MCP client's LLM, which may be cloud-hosted.
15
- - **Incremental indexing**: Only reindexes new or changed files, so re-runs are fast even on large document collections.
16
- - **Heading-aware navigation**: Extracts headings from PDFs, Word docs, and Markdown so results include the full heading path (e.g. "Chapter 3 > Setup > Prerequisites").
17
- - **Date filtering and recency boost**: Search within a date range or let recent documents rank higher automatically.
18
- - **Dual retrieval**: Returns both meaning-based chunk matches and keyword-based filename matches separately, so file lookups don't get buried by unrelated content.
19
- - **Multi-directory with per-folder rules**: Index multiple directories with individual include/exclude glob patterns — useful for shared drives with mixed content.
20
- - **Confluence integration**: Optionally searches your Confluence instance alongside local files, with results returned in the same format.
21
- - **Source links**: Each result includes a clickable link back to the source file or Confluence page in supported MCP clients.
18
+ ```json
19
+ {
20
+ "matched_files": [
21
+ { "uri": "file:///docs/database-configuration.docx", "score": 0.8432 }
22
+ ],
23
+ "num_matched_files": 1,
24
+ "chunks": [
25
+ {
26
+ "text": "To configure the database connection, set the DATABASE_URL environment variable...",
27
+ "score": 0.912,
28
+ "location": {
29
+ "uri": "file:///docs/setup-guide.pdf",
30
+ "page": 12,
31
+ "line": null,
32
+ "heading_path": ["Getting Started", "Configuration", "Database"]
33
+ }
34
+ }
35
+ ],
36
+ "num_chunks": 1,
37
+ "query": "how to configure the database",
38
+ "retrieval_time": "0.42s"
39
+ }
40
+ ```
22
41
 
23
42
  ## Installation
24
43
 
@@ -31,6 +50,12 @@ pip install chunksilo
31
50
 
32
51
  # Or with Confluence support:
33
52
  pip install chunksilo[confluence]
53
+
54
+ # Or with Jira support:
55
+ pip install chunksilo[jira]
56
+
57
+ # Or with both Confluence and Jira:
58
+ pip install chunksilo[confluence,jira]
34
59
  ```
35
60
 
36
61
  Then:
@@ -144,6 +169,27 @@ All settings are optional and have sensible defaults.
144
169
  | `confluence.timeout` | `10.0` | Request timeout in seconds |
145
170
  | `confluence.max_results` | `30` | Maximum results per search |
146
171
 
172
+ #### Jira Settings (optional)
173
+
174
+ > **Note:** Jira integration requires the optional dependency. Install with: `pip install chunksilo[jira]`
175
+
176
+ | Setting | Default | Description |
177
+ | :--- | :--- | :--- |
178
+ | `jira.url` | `""` | Jira base URL (empty = disabled) |
179
+ | `jira.username` | `""` | Jira username/email |
180
+ | `jira.api_token` | `""` | Jira API token |
181
+ | `jira.timeout` | `10.0` | Request timeout in seconds |
182
+ | `jira.max_results` | `30` | Maximum results per search |
183
+ | `jira.projects` | `[]` | Project keys to search (empty = all) |
184
+ | `jira.include_comments` | `false` | Include issue comments in search |
185
+ | `jira.include_custom_fields` | `false` | Include custom fields in search |
186
+
187
+ **Creating a Jira API Token:**
188
+ 1. Log in to Jira
189
+ 2. Go to Account Settings > Security > API Tokens
190
+ 3. Click "Create API Token"
191
+ 4. Copy the token and add it to your config
192
+
147
193
  #### SSL Settings (optional)
148
194
 
149
195
  | Setting | Default | Description |
@@ -317,6 +363,7 @@ Add to `mcp_settings.json` (typically in `~/.config/Code/User/globalStorage/roov
317
363
  - **Retrieval errors**: Check paths in your MCP client configuration.
318
364
  - **Offline mode**: PyPI installs default to `offline: false` (models auto-download). The offline bundle includes pre-downloaded models and sets `offline: true`. Set `retrieval.offline: true` in `config.yaml` to prevent network calls after initial model download.
319
365
  - **Confluence Integration**: Install with `pip install chunksilo[confluence]`, then set `confluence.url`, `confluence.username`, and `confluence.api_token` in `config.yaml`.
366
+ - **Jira Integration**: Install with `pip install chunksilo[jira]`, then set `jira.url`, `jira.username`, and `jira.api_token` in `config.yaml`. Optionally configure `jira.projects` to restrict search to specific project keys.
320
367
  - **Custom CA Bundle**: Set `ssl.ca_bundle_path` in `config.yaml` for custom certificates.
321
368
  - **Network mounts**: Unavailable directories are skipped with a warning; indexing continues with available directories.
322
369
  - **Legacy .doc files**: Requires LibreOffice to be installed for automatic conversion to .docx. If LibreOffice is not found, .doc files are skipped with a warning. Full heading extraction is supported.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunksilo"
7
- version = "2.0.0"
7
+ version = "2.1.0"
8
8
  description = "Local RAG-based semantic document search with MCP server interface"
9
9
  license = "Apache-2.0"
10
10
  requires-python = ">=3.11"
@@ -26,6 +26,7 @@ dynamic = ["dependencies"]
26
26
 
27
27
  [project.optional-dependencies]
28
28
  confluence = ["llama-index-readers-confluence>=0.6.0,<1"]
29
+ jira = ["jira>=3.5.0,<4"]
29
30
  test = ["pytest>=7.4.0,<9", "requests>=2.31.0,<3"]
30
31
 
31
32
  [project.scripts]
@@ -1,4 +1,4 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  """ChunkSilo - Local RAG-based semantic document search."""
3
3
 
4
- __version__ = "2.0.0"
4
+ __version__ = "2.1.0"
@@ -70,6 +70,16 @@ _DEFAULTS: dict[str, Any] = {
70
70
  "timeout": 10.0,
71
71
  "max_results": 30,
72
72
  },
73
+ "jira": {
74
+ "url": "",
75
+ "username": "",
76
+ "api_token": "",
77
+ "timeout": 10.0,
78
+ "max_results": 30,
79
+ "projects": [], # Empty list = all accessible projects
80
+ "include_comments": False,
81
+ "include_custom_fields": False,
82
+ },
73
83
  "ssl": {
74
84
  "ca_bundle_path": "",
75
85
  },