chunksilo 2.0.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chunksilo might be problematic. Click here for more details.
- chunksilo/__init__.py +1 -1
- chunksilo/cfgload.py +10 -0
- chunksilo/search.py +478 -10
- {chunksilo-2.0.0.dist-info → chunksilo-2.1.0.dist-info}/METADATA +65 -16
- chunksilo-2.1.0.dist-info/RECORD +15 -0
- {chunksilo-2.0.0.dist-info → chunksilo-2.1.0.dist-info}/WHEEL +1 -1
- chunksilo-2.0.0.dist-info/RECORD +0 -15
- {chunksilo-2.0.0.dist-info → chunksilo-2.1.0.dist-info}/entry_points.txt +0 -0
- {chunksilo-2.0.0.dist-info → chunksilo-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {chunksilo-2.0.0.dist-info → chunksilo-2.1.0.dist-info}/licenses/NOTICE +0 -0
- {chunksilo-2.0.0.dist-info → chunksilo-2.1.0.dist-info}/top_level.txt +0 -0
chunksilo/__init__.py
CHANGED
chunksilo/cfgload.py
CHANGED
|
@@ -70,6 +70,16 @@ _DEFAULTS: dict[str, Any] = {
|
|
|
70
70
|
"timeout": 10.0,
|
|
71
71
|
"max_results": 30,
|
|
72
72
|
},
|
|
73
|
+
"jira": {
|
|
74
|
+
"url": "",
|
|
75
|
+
"username": "",
|
|
76
|
+
"api_token": "",
|
|
77
|
+
"timeout": 10.0,
|
|
78
|
+
"max_results": 30,
|
|
79
|
+
"projects": [], # Empty list = all accessible projects
|
|
80
|
+
"include_comments": False,
|
|
81
|
+
"include_custom_fields": False,
|
|
82
|
+
},
|
|
73
83
|
"ssl": {
|
|
74
84
|
"ca_bundle_path": "",
|
|
75
85
|
},
|
chunksilo/search.py
CHANGED
|
@@ -7,6 +7,7 @@ Contains all retrieval logic independent of the MCP server.
|
|
|
7
7
|
Used by both the MCP server (server.py) and the CLI (cli.py).
|
|
8
8
|
"""
|
|
9
9
|
import os
|
|
10
|
+
import re
|
|
10
11
|
import time
|
|
11
12
|
import math
|
|
12
13
|
import logging
|
|
@@ -26,6 +27,12 @@ except ImportError:
|
|
|
26
27
|
ConfluenceReader = None
|
|
27
28
|
requests = None
|
|
28
29
|
|
|
30
|
+
# Optional Jira integration
|
|
31
|
+
try:
|
|
32
|
+
from jira import JIRA
|
|
33
|
+
except ImportError:
|
|
34
|
+
JIRA = None
|
|
35
|
+
|
|
29
36
|
# TEMPORARY FIX: Patch Confluence HTML parser to handle syntax highlighting spans
|
|
30
37
|
# Remove when upstream issue is fixed (see confluence_html_formatter.py)
|
|
31
38
|
if ConfluenceReader is not None:
|
|
@@ -365,23 +372,279 @@ def _get_confluence_page_dates(
|
|
|
365
372
|
data = response.json()
|
|
366
373
|
result = {}
|
|
367
374
|
if "createdAt" in data:
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
result["creation_date"] =
|
|
371
|
-
except Exception:
|
|
372
|
-
pass
|
|
375
|
+
creation_date = _parse_iso8601_to_date(data["createdAt"])
|
|
376
|
+
if creation_date:
|
|
377
|
+
result["creation_date"] = creation_date
|
|
373
378
|
if "version" in data and "createdAt" in data["version"]:
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
result["last_modified_date"] =
|
|
377
|
-
except Exception:
|
|
378
|
-
pass
|
|
379
|
+
last_modified = _parse_iso8601_to_date(data["version"]["createdAt"])
|
|
380
|
+
if last_modified:
|
|
381
|
+
result["last_modified_date"] = last_modified
|
|
379
382
|
return result
|
|
380
383
|
except Exception as e:
|
|
381
384
|
logger.debug(f"Failed to fetch Confluence page dates: {e}")
|
|
382
385
|
return {}
|
|
383
386
|
|
|
384
387
|
|
|
388
|
+
def _prepare_jira_jql_query(query: str, config: dict[str, Any]) -> str:
    """Build a JQL query string from a free-text query and the Jira config.

    The query is reduced to search terms with
    ``_prepare_confluence_query_terms`` and each term becomes a
    ``text ~ "<term>"`` condition; several terms are combined with ``OR``.
    When that helper yields no terms, the stripped original query is used as
    a single phrase. If ``config["jira"]["projects"]`` is non-empty the
    search is restricted with ``project IN (...)``. The result always ends
    with ``ORDER BY updated DESC``.

    Every value placed inside a double-quoted JQL literal is escaped with
    ``_escape_jql_string`` so that quotes or backslashes in user input (or in
    configured project keys) cannot terminate the literal early.

    Args:
        query: The user's search string.
        config: Configuration dict; only ``config["jira"]`` is read here.

    Returns:
        A JQL string, or ``""`` when the query is empty after stripping
        (a warning is logged in that case).

    References:
        - Jira text field: https://support.atlassian.com/jira-software-cloud/docs/search-for-work-items-using-the-text-field/
        - JQL operators: https://support.atlassian.com/jira-software-cloud/docs/jql-operators/
    """
    query_terms = _prepare_confluence_query_terms(query)

    if not query_terms:
        # Nothing survived term preparation: fall back to the raw query.
        raw_query = query.strip()
        if not raw_query:
            logger.warning("Jira search skipped: empty query after processing")
            return ""
        text_clause = f'text ~ "{_escape_jql_string(raw_query)}"'
    else:
        # NOTE(review): _prepare_confluence_query_terms is defined elsewhere;
        # if it already escapes quotes, the extra escaping here is redundant
        # but still yields a well-formed JQL literal - confirm and simplify.
        conditions = [
            f'text ~ "{_escape_jql_string(str(term))}"' for term in query_terms
        ]
        if len(conditions) == 1:
            text_clause = conditions[0]
        else:
            # Any-term match; parenthesised so a following AND binds to the
            # whole group rather than to the last OR operand.
            text_clause = f"({' OR '.join(conditions)})"

    # An empty/missing project list means "no project restriction".
    projects = config["jira"].get("projects", [])
    if projects:
        project_list = ", ".join(
            f'"{_escape_jql_string(str(project))}"' for project in projects
        )
        jql = f"{text_clause} AND project IN ({project_list})"
    else:
        jql = text_clause

    # Most recently updated issues first.
    return jql + " ORDER BY updated DESC"


def _escape_jql_string(value: str) -> str:
    """Escape *value* for use inside a double-quoted JQL string literal."""
    # Backslashes must be doubled first; otherwise the backslashes inserted
    # in front of quotes would themselves be doubled.
    return value.replace("\\", "\\\\").replace('"', '\\"')
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _jira_issue_to_text(issue, include_comments: bool, include_custom_fields: bool) -> str:
    """Render a Jira issue as plain text.

    The text always starts with the issue key and summary. A description,
    a comment list and a custom-field list are appended when available and
    requested.

    Args:
        issue: Issue object exposing ``key`` and ``fields`` (as returned by
            the ``jira`` library).
        include_comments: Append one ``- <author>: <body>`` line per comment.
        include_custom_fields: Append every ``customfield_*`` attribute of
            ``issue.fields`` whose value is not ``None`` and whose string
            form is not blank.

    Returns:
        Newline-joined text in this layout::

            Issue: PROJ-123
            Summary: Issue title here

            Description:
            Issue description text...

            Comments:
            - John Doe: Comment text

            Custom Fields:
            customfield_10001: value

    Missing *or* ``None`` optional fields (description, the comment
    container, its comment list, a comment's author or body) are skipped
    instead of raising.
    """
    fields = issue.fields

    # Key and summary are always present in the output.
    parts = [f"Issue: {issue.key}", f"Summary: {fields.summary}"]

    description = getattr(fields, "description", None)
    if description:
        parts.append(f"\nDescription:\n{description}")

    if include_comments:
        # getattr with a None default covers both "attribute absent" and
        # "attribute present but None"; hasattr alone only covers the former.
        comment_container = getattr(fields, "comment", None)
        comment_lines = []
        for comment in getattr(comment_container, "comments", None) or []:
            body = getattr(comment, "body", None)
            if body is None:
                continue
            author = getattr(comment, "author", None)
            author_name = author.displayName if author and hasattr(author, "displayName") else "Unknown"
            comment_lines.append(f"- {author_name}: {body}")
        if comment_lines:
            parts.append("\nComments:")
            parts.extend(comment_lines)

    if include_custom_fields:
        custom_fields = []
        for field_name in dir(fields):
            if not field_name.startswith("customfield_"):
                continue
            field_value = getattr(fields, field_name, None)
            if field_value is not None and str(field_value).strip():
                custom_fields.append(f"{field_name}: {field_value}")
        if custom_fields:
            parts.append("\nCustom Fields:")
            parts.extend(custom_fields)

    return "\n".join(parts)
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def _jira_issue_to_metadata(issue, jira_url: str) -> dict[str, Any]:
    """Extract a flat metadata dict from a Jira issue.

    Args:
        issue: Issue object exposing ``key`` and ``fields`` (as returned by
            the ``jira`` library).
        jira_url: Base Jira URL. Not read by this function; kept so existing
            callers that pass it keep working.

    Returns:
        A dict that always contains:

        - ``source``: the literal ``"Jira"``
        - ``issue_key``, ``issue_type``, ``status``, ``title``
        - ``file_name``: ``"<key>: <summary>"``

        and, when the corresponding data is present on the issue:

        - ``priority``
        - ``creation_date`` / ``last_modified_date``: ``YYYY-MM-DD`` strings
          produced by ``_parse_iso8601_to_date`` (omitted if parsing fails)
        - ``project_key`` / ``project_name``
        - ``assignee`` / ``reporter``: display names
        - ``attachments``: list of ``{"filename", "url", "size"}`` dicts;
          attachment content is not downloaded

    Raises:
        AttributeError: If ``issue`` lacks ``key``, ``fields.summary``,
            ``fields.issuetype.name`` or ``fields.status.name``. Optional
            fields never raise; absent pieces are skipped (or ``None`` inside
            an attachment entry).
    """
    fields = issue.fields

    # Required fields: accessed directly so a malformed issue fails loudly.
    metadata: dict[str, Any] = {
        "source": "Jira",
        "issue_key": issue.key,
        "issue_type": fields.issuetype.name,
        "status": fields.status.name,
        "title": fields.summary,
        "file_name": f"{issue.key}: {fields.summary}",
    }

    # Optional nested objects: getattr(None, ...) with a default is safe, so
    # one lookup chain covers "absent", "None" and "present but partial".
    priority_name = getattr(getattr(fields, "priority", None), "name", None)
    if priority_name:
        metadata["priority"] = priority_name

    for meta_key, field_name in (
        ("creation_date", "created"),
        ("last_modified_date", "updated"),
    ):
        raw_timestamp = getattr(fields, field_name, None)
        if raw_timestamp:
            parsed_date = _parse_iso8601_to_date(raw_timestamp)
            if parsed_date:
                metadata[meta_key] = parsed_date

    project = getattr(fields, "project", None)
    if project:
        for meta_key, attr_name in (("project_key", "key"), ("project_name", "name")):
            project_value = getattr(project, attr_name, None)
            if project_value is not None:
                metadata[meta_key] = project_value

    for role in ("assignee", "reporter"):
        display_name = getattr(getattr(fields, role, None), "displayName", None)
        if display_name:
            metadata[role] = display_name

    # Attachment metadata only; "url" is the attachment's ``content`` value.
    attachments = [
        {
            "filename": getattr(att, "filename", None),
            "url": getattr(att, "content", None),
            "size": getattr(att, "size", None),
        }
        for att in (getattr(fields, "attachment", None) or [])
    ]
    if attachments:
        metadata["attachments"] = attachments

    return metadata
|
|
646
|
+
|
|
647
|
+
|
|
385
648
|
def _search_confluence(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
|
|
386
649
|
"""Search Confluence for documents matching the query using CQL."""
|
|
387
650
|
base_url = config["confluence"]["url"]
|
|
@@ -447,6 +710,153 @@ def _search_confluence(query: str, config: dict[str, Any]) -> list[NodeWithScore
|
|
|
447
710
|
return []
|
|
448
711
|
|
|
449
712
|
|
|
713
|
+
def _search_jira(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
    """Search Jira with JQL and return the hits as ``NodeWithScore`` objects.

    Each matching issue is converted with ``_jira_issue_to_text`` and
    ``_jira_issue_to_metadata`` into a ``TextNode`` wrapped in a
    ``NodeWithScore`` with ``score=0.0``.

    Args:
        query: The user's search string.
        config: Configuration dict. Reads ``config["jira"]`` (``url``,
            ``username``, ``api_token``, ``max_results``,
            ``include_comments``, ``include_custom_fields``, optional
            ``timeout``) and ``config["ssl"]["ca_bundle_path"]``.

    Returns:
        A list of nodes. An empty list is returned when:

        - ``jira.url`` is empty (debug log),
        - the ``jira`` package is not installed (warning log),
        - ``jira.username`` or ``jira.api_token`` is empty (warning log),
        - the prepared JQL is empty,
        - connecting or searching raises (error log with traceback).

        An issue that cannot be converted is logged and skipped; the
        remaining issues are still returned.

    Notes:
        ``jira.timeout`` is forwarded to the Jira client so that the HTTP
        calls themselves are bounded. A timeout applied only around the
        worker thread by a caller does not stop a request already in flight.
    """
    jira_cfg = config["jira"]

    base_url = jira_cfg["url"]
    if not base_url:
        logger.debug("Jira search skipped: jira.url not set in config")
        return []

    # Optional dependency: degrade to "no results" when it is not installed.
    if JIRA is None:
        logger.warning("jira library not installed, skipping Jira search")
        return []

    username = jira_cfg["username"]
    api_token = jira_cfg["api_token"]
    max_results = jira_cfg["max_results"]
    include_comments = jira_cfg["include_comments"]
    include_custom_fields = jira_cfg["include_custom_fields"]
    request_timeout = jira_cfg.get("timeout")
    ca_bundle_path = config["ssl"]["ca_bundle_path"] or None

    missing = [
        setting
        for setting, value in (("jira.username", username), ("jira.api_token", api_token))
        if not value
    ]
    if missing:
        logger.warning(f"Jira search skipped: missing {', '.join(missing)} in config")
        return []

    jira_client = None
    try:
        # Build the JQL before connecting so an empty query costs no
        # network round-trip.
        jql = _prepare_jira_jql_query(query, config)
        if not jql:
            return []
        logger.debug(f"Jira JQL query: {jql}")

        jira_options = {"server": base_url}
        if ca_bundle_path:
            # Custom CA bundle for TLS verification.
            jira_options["verify"] = ca_bundle_path

        jira_client = JIRA(
            options=jira_options,
            basic_auth=(username, api_token),
            timeout=request_timeout,
        )

        issues = jira_client.search_issues(
            jql,
            maxResults=max_results,
            fields='*all',  # custom fields are only returned when requested
        )

        nodes: list[NodeWithScore] = []
        for issue in issues:
            try:
                text = _jira_issue_to_text(issue, include_comments, include_custom_fields)
                metadata = _jira_issue_to_metadata(issue, base_url)
            except Exception as e:
                # One malformed issue must not discard every other hit.
                issue_key = getattr(issue, "key", "<unknown>")
                logger.warning(f"Skipping Jira issue {issue_key}: {e}")
                continue
            # Placeholder score; ranking is left to later pipeline stages.
            nodes.append(NodeWithScore(node=TextNode(text=text, metadata=metadata), score=0.0))

        logger.debug(f"Jira search returned {len(nodes)} issues")
        return nodes

    except Exception as e:
        # Boundary handler: a Jira failure must not break the whole search.
        logger.error(f"Failed to search Jira: {e}", exc_info=True)
        return []

    finally:
        # Release the underlying HTTP session.
        if jira_client is not None:
            try:
                jira_client.close()
            except Exception as e:
                logger.debug(f"Failed to close Jira client: {e}")
|
|
858
|
+
|
|
859
|
+
|
|
450
860
|
def load_llamaindex_index(config: dict[str, Any] | None = None):
|
|
451
861
|
"""Load the LlamaIndex from storage."""
|
|
452
862
|
if config is None:
|
|
@@ -471,6 +881,41 @@ def load_llamaindex_index(config: dict[str, Any] | None = None):
|
|
|
471
881
|
return index
|
|
472
882
|
|
|
473
883
|
|
|
884
|
+
def _parse_iso8601_to_date(iso_string: str) -> str | None:
|
|
885
|
+
"""Parse ISO 8601 timestamp to YYYY-MM-DD format.
|
|
886
|
+
|
|
887
|
+
Handles various ISO 8601 formats including:
|
|
888
|
+
- Z suffix: 2024-01-15T10:30:00Z
|
|
889
|
+
- Timezone with colon: 2024-01-15T10:30:00.000+00:00
|
|
890
|
+
- Timezone without colon: 2024-01-15T10:30:00.000+0000 (Jira format)
|
|
891
|
+
|
|
892
|
+
Args:
|
|
893
|
+
iso_string: ISO 8601 formatted datetime string
|
|
894
|
+
|
|
895
|
+
Returns:
|
|
896
|
+
Date in YYYY-MM-DD format, or None if parsing fails
|
|
897
|
+
"""
|
|
898
|
+
if not iso_string:
|
|
899
|
+
return None
|
|
900
|
+
|
|
901
|
+
try:
|
|
902
|
+
# Normalize the timestamp
|
|
903
|
+
normalized = iso_string.strip()
|
|
904
|
+
|
|
905
|
+
# Replace Z suffix with +00:00
|
|
906
|
+
normalized = normalized.replace('Z', '+00:00')
|
|
907
|
+
|
|
908
|
+
# Insert colon in timezone offsets like +0000 → +00:00
|
|
909
|
+
# Matches ±HHMM at end of string, inserts colon: ±HH:MM
|
|
910
|
+
normalized = re.sub(r'([+-]\d{2})(\d{2})$', r'\1:\2', normalized)
|
|
911
|
+
|
|
912
|
+
# Parse and format
|
|
913
|
+
dt = datetime.fromisoformat(normalized)
|
|
914
|
+
return dt.strftime("%Y-%m-%d")
|
|
915
|
+
except Exception:
|
|
916
|
+
return None
|
|
917
|
+
|
|
918
|
+
|
|
474
919
|
def _parse_date(date_str: str) -> datetime | None:
|
|
475
920
|
"""Parse date string in YYYY-MM-DD format."""
|
|
476
921
|
try:
|
|
@@ -623,6 +1068,23 @@ def run_search(
|
|
|
623
1068
|
if confluence_nodes:
|
|
624
1069
|
nodes.extend(confluence_nodes)
|
|
625
1070
|
|
|
1071
|
+
# Search Jira (with timeout)
|
|
1072
|
+
jira_nodes: list[NodeWithScore] = []
|
|
1073
|
+
jira_timeout = config["jira"]["timeout"]
|
|
1074
|
+
if config["jira"]["url"]:
|
|
1075
|
+
try:
|
|
1076
|
+
with ThreadPoolExecutor(max_workers=1) as executor:
|
|
1077
|
+
future = executor.submit(_search_jira, enhanced_query, config)
|
|
1078
|
+
jira_nodes = future.result(timeout=jira_timeout)
|
|
1079
|
+
logger.info(f"Jira search returned {len(jira_nodes)} entries")
|
|
1080
|
+
except FuturesTimeoutError:
|
|
1081
|
+
logger.warning(f"Jira search timed out after {jira_timeout}s")
|
|
1082
|
+
except Exception as e:
|
|
1083
|
+
logger.error(f"Error during Jira search: {e}")
|
|
1084
|
+
|
|
1085
|
+
if jira_nodes:
|
|
1086
|
+
nodes.extend(jira_nodes)
|
|
1087
|
+
|
|
626
1088
|
# Apply date filtering
|
|
627
1089
|
if date_from or date_to:
|
|
628
1090
|
original_count = len(nodes)
|
|
@@ -734,6 +1196,12 @@ def run_search(
|
|
|
734
1196
|
from urllib.parse import quote
|
|
735
1197
|
encoded_title = quote(title.replace(" ", "+"))
|
|
736
1198
|
source_uri = f"{confluence_url.rstrip('/')}/spaces/~{encoded_title}"
|
|
1199
|
+
elif original_source == "Jira":
|
|
1200
|
+
# Build Jira issue URI using standard browse URL format
|
|
1201
|
+
jira_url = config["jira"]["url"]
|
|
1202
|
+
issue_key = metadata.get("issue_key")
|
|
1203
|
+
if jira_url and issue_key:
|
|
1204
|
+
source_uri = f"{jira_url.rstrip('/')}/browse/{issue_key}"
|
|
737
1205
|
elif file_path:
|
|
738
1206
|
source_uri = _resolve_file_uri(file_path, config)
|
|
739
1207
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chunksilo
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: Local RAG-based semantic document search with MCP server interface
|
|
5
5
|
Author: Fredrik Reveny
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -33,32 +33,53 @@ Requires-Dist: fastembed<1,>=0.5.0
|
|
|
33
33
|
Requires-Dist: pyyaml<7,>=6.0
|
|
34
34
|
Provides-Extra: confluence
|
|
35
35
|
Requires-Dist: llama-index-readers-confluence<1,>=0.6.0; extra == "confluence"
|
|
36
|
+
Provides-Extra: jira
|
|
37
|
+
Requires-Dist: jira<4,>=3.5.0; extra == "jira"
|
|
36
38
|
Provides-Extra: test
|
|
37
39
|
Requires-Dist: pytest<9,>=7.4.0; extra == "test"
|
|
38
40
|
Requires-Dist: requests<3,>=2.31.0; extra == "test"
|
|
39
41
|
Dynamic: license-file
|
|
40
42
|
|
|
43
|
+
<p align="center">
|
|
44
|
+
<img src="chunksilo.png" alt="ChunkSilo Logo" width="500">
|
|
45
|
+
</p>
|
|
46
|
+
|
|
41
47
|
# ChunkSilo MCP Server
|
|
42
48
|
|
|
43
49
|
ChunkSilo is like a local Google for your documents. It uses semantic search — matching by meaning rather than exact keywords — so your LLM can find relevant information across all your files even when the wording differs from your query. Point it at your PDFs, Word docs, Markdown, and text files, and it builds a fully searchable index locally on your machine.
|
|
44
50
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
-
|
|
48
|
-
-
|
|
49
|
-
-
|
|
50
|
-
-
|
|
51
|
+
- Runs entirely on your machine — no servers, no infrastructure
|
|
52
|
+
- Semantic search + keyword filename matching across PDF, DOCX, DOC, Markdown, and TXT
|
|
53
|
+
- Incremental indexing — only reprocesses new or changed files
|
|
54
|
+
- Heading-aware results with source links back to the original file
|
|
55
|
+
- Date filtering and recency boosting
|
|
56
|
+
- Optional Confluence and Jira integrations
|
|
51
57
|
|
|
52
|
-
|
|
58
|
+
### Example `search_docs` output
|
|
53
59
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
60
|
+
```json
|
|
61
|
+
{
|
|
62
|
+
"matched_files": [
|
|
63
|
+
{ "uri": "file:///docs/database-configuration.docx", "score": 0.8432 }
|
|
64
|
+
],
|
|
65
|
+
"num_matched_files": 1,
|
|
66
|
+
"chunks": [
|
|
67
|
+
{
|
|
68
|
+
"text": "To configure the database connection, set the DATABASE_URL environment variable...",
|
|
69
|
+
"score": 0.912,
|
|
70
|
+
"location": {
|
|
71
|
+
"uri": "file:///docs/setup-guide.pdf",
|
|
72
|
+
"page": 12,
|
|
73
|
+
"line": null,
|
|
74
|
+
"heading_path": ["Getting Started", "Configuration", "Database"]
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
],
|
|
78
|
+
"num_chunks": 1,
|
|
79
|
+
"query": "how to configure the database",
|
|
80
|
+
"retrieval_time": "0.42s"
|
|
81
|
+
}
|
|
82
|
+
```
|
|
62
83
|
|
|
63
84
|
## Installation
|
|
64
85
|
|
|
@@ -71,6 +92,12 @@ pip install chunksilo
|
|
|
71
92
|
|
|
72
93
|
# Or with Confluence support:
|
|
73
94
|
pip install chunksilo[confluence]
|
|
95
|
+
|
|
96
|
+
# Or with Jira support:
|
|
97
|
+
pip install chunksilo[jira]
|
|
98
|
+
|
|
99
|
+
# Or with both Confluence and Jira:
|
|
100
|
+
pip install chunksilo[confluence,jira]
|
|
74
101
|
```
|
|
75
102
|
|
|
76
103
|
Then:
|
|
@@ -184,6 +211,27 @@ All settings are optional and have sensible defaults.
|
|
|
184
211
|
| `confluence.timeout` | `10.0` | Request timeout in seconds |
|
|
185
212
|
| `confluence.max_results` | `30` | Maximum results per search |
|
|
186
213
|
|
|
214
|
+
#### Jira Settings (optional)
|
|
215
|
+
|
|
216
|
+
> **Note:** Jira integration requires the optional dependency. Install with: `pip install chunksilo[jira]`
|
|
217
|
+
|
|
218
|
+
| Setting | Default | Description |
|
|
219
|
+
| :--- | :--- | :--- |
|
|
220
|
+
| `jira.url` | `""` | Jira base URL (empty = disabled) |
|
|
221
|
+
| `jira.username` | `""` | Jira username/email |
|
|
222
|
+
| `jira.api_token` | `""` | Jira API token |
|
|
223
|
+
| `jira.timeout` | `10.0` | Request timeout in seconds |
|
|
224
|
+
| `jira.max_results` | `30` | Maximum results per search |
|
|
225
|
+
| `jira.projects` | `[]` | Project keys to search (empty = all) |
|
|
226
|
+
| `jira.include_comments` | `false` | Include issue comments in search |
|
|
227
|
+
| `jira.include_custom_fields` | `false` | Include custom fields in search |
|
|
228
|
+
|
|
229
|
+
**Creating a Jira API Token:**
|
|
230
|
+
1. Log into Jira
|
|
231
|
+
2. Go to Account Settings > Security > API Tokens
|
|
232
|
+
3. Click "Create API Token"
|
|
233
|
+
4. Copy the token and add it to your config
|
|
234
|
+
|
|
187
235
|
#### SSL Settings (optional)
|
|
188
236
|
|
|
189
237
|
| Setting | Default | Description |
|
|
@@ -357,6 +405,7 @@ Add to `mcp_settings.json` (typically in `~/.config/Code/User/globalStorage/roov
|
|
|
357
405
|
- **Retrieval errors**: Check paths in your MCP client configuration.
|
|
358
406
|
- **Offline mode**: PyPI installs default to `offline: false` (models auto-download). The offline bundle includes pre-downloaded models and sets `offline: true`. Set `retrieval.offline: true` in `config.yaml` to prevent network calls after initial model download.
|
|
359
407
|
- **Confluence Integration**: Install with `pip install chunksilo[confluence]`, then set `confluence.url`, `confluence.username`, and `confluence.api_token` in `config.yaml`.
|
|
408
|
+
- **Jira Integration**: Install with `pip install chunksilo[jira]`, then set `jira.url`, `jira.username`, and `jira.api_token` in `config.yaml`. Optionally configure `jira.projects` to restrict search to specific project keys.
|
|
360
409
|
- **Custom CA Bundle**: Set `ssl.ca_bundle_path` in `config.yaml` for custom certificates.
|
|
361
410
|
- **Network mounts**: Unavailable directories are skipped with a warning; indexing continues with available directories.
|
|
362
411
|
- **Legacy .doc files**: Requires LibreOffice to be installed for automatic conversion to .docx. If LibreOffice is not found, .doc files are skipped with a warning. Full heading extraction is supported.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
chunksilo/__init__.py,sha256=wTXWfXq3GyIUT2AJrq0y9aYikeeR6kIWaFy05LaPS64,121
|
|
2
|
+
chunksilo/__main__.py,sha256=eY8-KfJfBz0nibDPY_jv2RvkLXEx7ZDSPRWiDJb7PpY,39
|
|
3
|
+
chunksilo/cfgload.py,sha256=dvzDSftqbLCEhYb76OPA4hP_ToJE7FYdpko67m_2RkA,4639
|
|
4
|
+
chunksilo/cli.py,sha256=ZKXzW-HOmnLXC4Ynu-D86XIbp7bdQ_OYVHuHFVlj41U,4170
|
|
5
|
+
chunksilo/confluence_html_formatter.py,sha256=D8pb5TCrai6exIqeajH49y4D_t0jfQkmI6aNm4BzPIg,2828
|
|
6
|
+
chunksilo/index.py,sha256=ECSTH8c0ZhnTxAHxM7dcCfFOGTk99STHuyDhc3uMCO0,51618
|
|
7
|
+
chunksilo/search.py,sha256=TakSH-62oM4mJg1v3O9ylivbF_IROBNfHVfJ8lkZMrw,47879
|
|
8
|
+
chunksilo/server.py,sha256=PhQWMvEGb3UqWwk0tm44kTOnKH2NH4NmshnKx11xwh8,3374
|
|
9
|
+
chunksilo-2.1.0.dist-info/licenses/LICENSE,sha256=kda7NTahQy3nKvLe-LGIVCdI-qePTyqNNx2e6HLVH3k,10766
|
|
10
|
+
chunksilo-2.1.0.dist-info/licenses/NOTICE,sha256=58GCfasR-XT7RPvz9OoWb4NrPg-3AqEwzw9FJOhngZo,1228
|
|
11
|
+
chunksilo-2.1.0.dist-info/METADATA,sha256=4O5jw6bowVdUVmKOyZIhEeHo6Bgr16mJtGTIKgzkFwU,13772
|
|
12
|
+
chunksilo-2.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
13
|
+
chunksilo-2.1.0.dist-info/entry_points.txt,sha256=TfgrfBQyIRHMG5NFaXR-owDLsjHBCxjJ4YGIGBnVzbQ,87
|
|
14
|
+
chunksilo-2.1.0.dist-info/top_level.txt,sha256=xfAH0GhTfZbwkbi_DgUA5keNP-osqVNcoZjwkhpwVms,10
|
|
15
|
+
chunksilo-2.1.0.dist-info/RECORD,,
|
chunksilo-2.0.0.dist-info/RECORD
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
chunksilo/__init__.py,sha256=Ph1w-3A5CKoGdxzUNVlrB483eOM5JxAQN8K88-6HbuY,121
|
|
2
|
-
chunksilo/__main__.py,sha256=eY8-KfJfBz0nibDPY_jv2RvkLXEx7ZDSPRWiDJb7PpY,39
|
|
3
|
-
chunksilo/cfgload.py,sha256=A7ab2RkeYQhFTTvn56I900lWpXCqXb8ao8DGYeUai1U,4359
|
|
4
|
-
chunksilo/cli.py,sha256=ZKXzW-HOmnLXC4Ynu-D86XIbp7bdQ_OYVHuHFVlj41U,4170
|
|
5
|
-
chunksilo/confluence_html_formatter.py,sha256=D8pb5TCrai6exIqeajH49y4D_t0jfQkmI6aNm4BzPIg,2828
|
|
6
|
-
chunksilo/index.py,sha256=ECSTH8c0ZhnTxAHxM7dcCfFOGTk99STHuyDhc3uMCO0,51618
|
|
7
|
-
chunksilo/search.py,sha256=8_6zuvyVdZHNMCBHrK3NcP0Ct7oYeCSLuoOHT8RxD0o,28722
|
|
8
|
-
chunksilo/server.py,sha256=PhQWMvEGb3UqWwk0tm44kTOnKH2NH4NmshnKx11xwh8,3374
|
|
9
|
-
chunksilo-2.0.0.dist-info/licenses/LICENSE,sha256=kda7NTahQy3nKvLe-LGIVCdI-qePTyqNNx2e6HLVH3k,10766
|
|
10
|
-
chunksilo-2.0.0.dist-info/licenses/NOTICE,sha256=58GCfasR-XT7RPvz9OoWb4NrPg-3AqEwzw9FJOhngZo,1228
|
|
11
|
-
chunksilo-2.0.0.dist-info/METADATA,sha256=AA4R48_d6h7V89dHM9hb0av53WlfrDKZzALfOZ0jywU,13360
|
|
12
|
-
chunksilo-2.0.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
|
|
13
|
-
chunksilo-2.0.0.dist-info/entry_points.txt,sha256=TfgrfBQyIRHMG5NFaXR-owDLsjHBCxjJ4YGIGBnVzbQ,87
|
|
14
|
-
chunksilo-2.0.0.dist-info/top_level.txt,sha256=xfAH0GhTfZbwkbi_DgUA5keNP-osqVNcoZjwkhpwVms,10
|
|
15
|
-
chunksilo-2.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|