chunksilo 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chunksilo might be problematic. Click here for more details.

chunksilo/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  """ChunkSilo - Local RAG-based semantic document search."""
3
3
 
4
- __version__ = "2.0.0"
4
+ __version__ = "2.1.1"
chunksilo/cfgload.py CHANGED
@@ -70,6 +70,16 @@ _DEFAULTS: dict[str, Any] = {
70
70
  "timeout": 10.0,
71
71
  "max_results": 30,
72
72
  },
73
+ "jira": {
74
+ "url": "",
75
+ "username": "",
76
+ "api_token": "",
77
+ "timeout": 10.0,
78
+ "max_results": 30,
79
+ "projects": [], # Empty list = all accessible projects
80
+ "include_comments": False,
81
+ "include_custom_fields": False,
82
+ },
73
83
  "ssl": {
74
84
  "ca_bundle_path": "",
75
85
  },
chunksilo/search.py CHANGED
@@ -7,6 +7,7 @@ Contains all retrieval logic independent of the MCP server.
7
7
  Used by both the MCP server (server.py) and the CLI (cli.py).
8
8
  """
9
9
  import os
10
+ import re
10
11
  import time
11
12
  import math
12
13
  import logging
@@ -26,6 +27,12 @@ except ImportError:
26
27
  ConfluenceReader = None
27
28
  requests = None
28
29
 
30
+ # Optional Jira integration
31
+ try:
32
+ from jira import JIRA
33
+ except ImportError:
34
+ JIRA = None
35
+
29
36
  # TEMPORARY FIX: Patch Confluence HTML parser to handle syntax highlighting spans
30
37
  # Remove when upstream issue is fixed (see confluence_html_formatter.py)
31
38
  if ConfluenceReader is not None:
@@ -365,23 +372,279 @@ def _get_confluence_page_dates(
365
372
  data = response.json()
366
373
  result = {}
367
374
  if "createdAt" in data:
368
- try:
369
- dt = datetime.fromisoformat(data["createdAt"].replace("Z", "+00:00"))
370
- result["creation_date"] = dt.strftime("%Y-%m-%d")
371
- except Exception:
372
- pass
375
+ creation_date = _parse_iso8601_to_date(data["createdAt"])
376
+ if creation_date:
377
+ result["creation_date"] = creation_date
373
378
  if "version" in data and "createdAt" in data["version"]:
374
- try:
375
- dt = datetime.fromisoformat(data["version"]["createdAt"].replace("Z", "+00:00"))
376
- result["last_modified_date"] = dt.strftime("%Y-%m-%d")
377
- except Exception:
378
- pass
379
+ last_modified = _parse_iso8601_to_date(data["version"]["createdAt"])
380
+ if last_modified:
381
+ result["last_modified_date"] = last_modified
379
382
  return result
380
383
  except Exception as e:
381
384
  logger.debug(f"Failed to fetch Confluence page dates: {e}")
382
385
  return {}
383
386
 
384
387
 
388
+ def _prepare_jira_jql_query(query: str, config: dict[str, Any]) -> str:
389
+ """Construct a JQL query from user search terms and configuration.
390
+
391
+ Uses Jira's 'text' field which searches across Summary, Description,
392
+ Environment, Comments, and all text custom fields. This provides broad
393
+ coverage similar to natural language search.
394
+
395
+ Note: Fuzzy search operators (~) are deprecated in Jira Cloud but work
396
+ in Data Center/Server. ChunkSilo's semantic search (embeddings + reranker)
397
+ provides fuzzy matching regardless of Jira version.
398
+
399
+ Args:
400
+ query: User's search query string
401
+ config: Configuration dict containing jira settings
402
+
403
+ Returns:
404
+ JQL query string ready for Jira API
405
+
406
+ Raises:
407
+ None - returns safe default query on edge cases
408
+
409
+ Performance Note:
410
+ For high-volume instances, consider using specific fields like
411
+ "Summary ~ 'term' OR Description ~ 'term'" instead of "text ~ 'term'"
412
+ to reduce search scope. The current implementation prioritizes recall.
413
+
414
+ References:
415
+ - Jira text field: https://support.atlassian.com/jira-software-cloud/docs/search-for-work-items-using-the-text-field/
416
+ - JQL operators: https://support.atlassian.com/jira-software-cloud/docs/jql-operators/
417
+ """
418
+ # Reuse Confluence query term preparation for stopword filtering
419
+ # This gives us a clean list of meaningful search terms
420
+ query_terms = _prepare_confluence_query_terms(query)
421
+
422
+ # Build the text search clause
423
+ # Using JQL 'text' field which searches across all text fields for broad recall
424
+ if not query_terms:
425
+ # No meaningful terms after filtering, use original query
426
+ escaped = query.strip().replace('"', '\\"')
427
+ if not escaped:
428
+ logger.warning("Jira search skipped: empty query after processing")
429
+ return ""
430
+ text_clause = f'text ~ "{escaped}"'
431
+ elif len(query_terms) == 1:
432
+ # Single term - simple text search
433
+ text_clause = f'text ~ "{query_terms[0]}"'
434
+ else:
435
+ # Multiple terms - use OR logic to find issues matching any term
436
+ # ChunkSilo's reranker will score results by relevance after retrieval
437
+ text_conditions = ' OR '.join([f'text ~ "{term}"' for term in query_terms])
438
+ text_clause = f'({text_conditions})'
439
+
440
+ # Add project filter if configured
441
+ # Empty projects list means search all accessible projects
442
+ projects = config["jira"].get("projects", [])
443
+ if projects:
444
+ # Restrict search to specific project keys
445
+ project_list = ", ".join([f'"{p}"' for p in projects])
446
+ project_clause = f'project IN ({project_list})'
447
+ jql = f'{text_clause} AND {project_clause}'
448
+ else:
449
+ jql = text_clause
450
+
451
+ # Order by updated DESC for recency
452
+ # This enables ChunkSilo's recency boost feature and returns most relevant recent issues first
453
+ jql += ' ORDER BY updated DESC'
454
+
455
+ return jql
456
+
457
+
458
+ def _jira_issue_to_text(issue, include_comments: bool, include_custom_fields: bool) -> str:
459
+ """Convert a Jira issue to searchable text representation.
460
+
461
+ This function constructs a text representation of a Jira issue that will
462
+ be embedded and indexed by ChunkSilo's vector database. The text includes
463
+ structured sections for issue metadata, description, comments, and custom
464
+ fields, making it suitable for semantic search.
465
+
466
+ Args:
467
+ issue: JIRA issue object from the jira library
468
+ include_comments: If True, include all issue comments with author names
469
+ include_custom_fields: If True, include all custom field values
470
+
471
+ Returns:
472
+ Formatted text string suitable for embedding and search
473
+
474
+ Format:
475
+ Issue: PROJ-123
476
+ Summary: Issue title here
477
+
478
+ Description:
479
+ Issue description text...
480
+
481
+ Comments:
482
+ - John Doe: Comment text
483
+ - Jane Smith: Another comment
484
+
485
+ Custom Fields:
486
+ customfield_10001: value
487
+
488
+ Note:
489
+ Custom fields are detected by checking for attributes starting with
490
+ 'customfield_'. Only fields with non-empty values are included.
491
+
492
+ Error Handling:
493
+ Missing or None fields (description, comments) are gracefully skipped.
494
+ The function always returns valid text even with minimal issue data.
495
+ """
496
+ parts = []
497
+
498
+ # Issue key and summary are always included
499
+ # These are the most important fields for identifying and understanding the issue
500
+ parts.append(f"Issue: {issue.key}")
501
+ parts.append(f"Summary: {issue.fields.summary}")
502
+
503
+ # Description provides detailed context
504
+ # Use hasattr() for safe field access since description can be missing/None
505
+ if hasattr(issue.fields, 'description') and issue.fields.description:
506
+ parts.append(f"\nDescription:\n{issue.fields.description}")
507
+
508
+ # Comments provide discussion context and additional searchable content
509
+ # Include author names for context (who said what matters for search)
510
+ if include_comments and hasattr(issue.fields, 'comment'):
511
+ comments = issue.fields.comment.comments
512
+ if comments:
513
+ parts.append("\nComments:")
514
+ for comment in comments:
515
+ # Safely extract author display name
516
+ author = getattr(comment, 'author', None)
517
+ author_name = author.displayName if author and hasattr(author, 'displayName') else 'Unknown'
518
+ parts.append(f"- {author_name}: {comment.body}")
519
+
520
+ # Custom fields provide instance-specific metadata
521
+ # Detect by 'customfield_' prefix, include any with non-empty values
522
+ if include_custom_fields:
523
+ custom_fields = []
524
+ for field_name in dir(issue.fields):
525
+ if field_name.startswith('customfield_'):
526
+ field_value = getattr(issue.fields, field_name, None)
527
+ # Only include fields with meaningful values
528
+ if field_value is not None and str(field_value).strip():
529
+ custom_fields.append(f"{field_name}: {field_value}")
530
+
531
+ if custom_fields:
532
+ parts.append("\nCustom Fields:")
533
+ parts.extend(custom_fields)
534
+
535
+ # Join all sections with newlines for structured, readable text
536
+ # This format works well for semantic search and LLM processing
537
+ return "\n".join(parts)
538
+
539
+
540
+ def _jira_issue_to_metadata(issue, jira_url: str) -> dict[str, Any]:
541
+ """Extract structured metadata from a Jira issue.
542
+
543
+ This function extracts metadata that will be attached to the search result
544
+ node. The metadata is used for:
545
+ - Display in search results (title, status, etc.)
546
+ - Date range filtering (creation_date, last_modified_date)
547
+ - Recency boosting (last_modified_date)
548
+ - URI construction (issue_key)
549
+ - Result grouping and sorting
550
+
551
+ Args:
552
+ issue: JIRA issue object from the jira library
553
+ jira_url: Base Jira URL for constructing attachment URLs
554
+
555
+ Returns:
556
+ Dictionary of metadata fields following ChunkSilo conventions
557
+
558
+ Metadata Fields:
559
+ Required:
560
+ - source: Always "Jira"
561
+ - issue_key: Jira issue key (e.g., "PROJ-123")
562
+ - issue_type: Type of issue ("Bug", "Story", etc.)
563
+ - status: Current status ("Open", "In Progress", etc.)
564
+ - title: Issue summary
565
+ - file_name: Display name (format: "{key}: {summary}")
566
+
567
+ Optional (present if available):
568
+ - priority: Issue priority ("High", "Medium", "Low")
569
+ - creation_date: ISO format date string (YYYY-MM-DD)
570
+ - last_modified_date: ISO format date string (YYYY-MM-DD)
571
+ - project_key: Project key (e.g., "PROJ")
572
+ - project_name: Full project name
573
+ - assignee: Assigned user's display name
574
+ - reporter: Reporter's display name
575
+ - attachments: List of attachment metadata (not content)
576
+
577
+ Date Format:
578
+ Jira returns ISO 8601 dates like "2024-01-15T10:30:00.000+0000".
579
+ We convert to "YYYY-MM-DD" format for consistency with ChunkSilo's
580
+ date filtering and display logic.
581
+
582
+ Attachment Handling:
583
+ Per requirements, we list attachment metadata but do NOT download
584
+ or index attachment content. Users/models can follow URLs to access
585
+ attachments if needed.
586
+
587
+ Error Handling:
588
+ Missing optional fields (priority, assignee, etc.) are gracefully
589
+ skipped. The function always returns a valid metadata dict with
590
+ at least the required fields.
591
+ """
592
+ # Required fields - always present
593
+ metadata = {
594
+ "source": "Jira", # Identifies source for URI resolution and display
595
+ "issue_key": issue.key, # Unique identifier for linking
596
+ "issue_type": issue.fields.issuetype.name,
597
+ "status": issue.fields.status.name,
598
+ "title": issue.fields.summary,
599
+ # file_name format matches ChunkSilo convention for display
600
+ "file_name": f"{issue.key}: {issue.fields.summary}",
601
+ }
602
+
603
+ # Optional: Priority (may not exist on all issue types)
604
+ if hasattr(issue.fields, 'priority') and issue.fields.priority:
605
+ metadata["priority"] = issue.fields.priority.name
606
+
607
+ # Dates for filtering and recency boosting
608
+ # Parse ISO 8601 format and convert to YYYY-MM-DD for consistency
609
+ if hasattr(issue.fields, 'created') and issue.fields.created:
610
+ creation_date = _parse_iso8601_to_date(issue.fields.created)
611
+ if creation_date:
612
+ metadata["creation_date"] = creation_date
613
+
614
+ if hasattr(issue.fields, 'updated') and issue.fields.updated:
615
+ # last_modified_date used for recency boost calculation
616
+ last_modified = _parse_iso8601_to_date(issue.fields.updated)
617
+ if last_modified:
618
+ metadata["last_modified_date"] = last_modified
619
+
620
+ # Project information
621
+ if hasattr(issue.fields, 'project') and issue.fields.project:
622
+ metadata["project_key"] = issue.fields.project.key
623
+ metadata["project_name"] = issue.fields.project.name
624
+
625
+ # People - use display names for human-readable output
626
+ if hasattr(issue.fields, 'assignee') and issue.fields.assignee:
627
+ metadata["assignee"] = issue.fields.assignee.displayName
628
+
629
+ if hasattr(issue.fields, 'reporter') and issue.fields.reporter:
630
+ metadata["reporter"] = issue.fields.reporter.displayName
631
+
632
+ # Attachments - list metadata only, don't index content
633
+ # Per requirements: provide URIs so user/model can access if needed
634
+ if hasattr(issue.fields, 'attachment') and issue.fields.attachment:
635
+ attachments = []
636
+ for att in issue.fields.attachment:
637
+ attachments.append({
638
+ "filename": att.filename,
639
+ "url": att.content, # Direct download URL
640
+ "size": att.size, # Size in bytes
641
+ })
642
+ if attachments:
643
+ metadata["attachments"] = attachments
644
+
645
+ return metadata
646
+
647
+
385
648
  def _search_confluence(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
386
649
  """Search Confluence for documents matching the query using CQL."""
387
650
  base_url = config["confluence"]["url"]
@@ -447,6 +710,153 @@ def _search_confluence(query: str, config: dict[str, Any]) -> list[NodeWithScore
447
710
  return []
448
711
 
449
712
 
713
+ def _search_jira(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
714
+ """Search Jira for issues matching the query using JQL.
715
+
716
+ This function performs a real-time search against the configured Jira
717
+ instance using JQL (Jira Query Language). Results are converted to
718
+ ChunkSilo's NodeWithScore format and merged with other search results.
719
+
720
+ The function follows ChunkSilo's integration pattern (same as Confluence):
721
+ - Returns empty list on errors (graceful degradation)
722
+ - Logs warnings for configuration issues
723
+ - Uses timeout protection in calling code (ThreadPoolExecutor)
724
+ - Converts results to standard NodeWithScore format
725
+
726
+ Search Strategy:
727
+ - Uses JQL 'text' field for broad search across all text fields
728
+ - Relies on ChunkSilo's semantic search (embeddings + reranker) for
729
+ relevance ranking instead of traditional fuzzy search
730
+ - Returns results with score=0.0; FlashRank reranker scores them
731
+
732
+ Args:
733
+ query: User's search query string
734
+ config: Configuration dictionary with jira and ssl settings
735
+
736
+ Returns:
737
+ List of NodeWithScore objects, or empty list on error/disabled
738
+
739
+ Configuration Requirements:
740
+ config["jira"]["url"]: Jira base URL (empty = disabled)
741
+ config["jira"]["username"]: Jira username or email
742
+ config["jira"]["api_token"]: Jira API token (not password)
743
+ config["jira"]["max_results"]: Maximum issues to return
744
+ config["jira"]["projects"]: List of project keys (empty = all)
745
+ config["jira"]["include_comments"]: Include issue comments
746
+ config["jira"]["include_custom_fields"]: Include custom fields
747
+ config["ssl"]["ca_bundle_path"]: Optional SSL CA bundle path
748
+
749
+ Error Handling:
750
+ - Empty URL: Returns [] with debug log
751
+ - Missing library: Returns [] with warning log
752
+ - Missing credentials: Returns [] with warning log
753
+ - API errors: Returns [] with error log (exc_info=True)
754
+ - All errors are non-fatal to allow other searches to succeed
755
+
756
+ Performance:
757
+ - Respects max_results limit (default 30)
758
+ - Orders by updated DESC (most recent first)
759
+ - Fetches all fields including custom fields
760
+
761
+ SSL/TLS:
762
+ - Supports custom CA bundles via ssl.ca_bundle_path
763
+ - Automatically configured through jira_options["verify"]
764
+
765
+ Authentication:
766
+ - Uses basic auth (username + API token)
767
+ - Works for both Jira Cloud and Data Center/Server
768
+
769
+ References:
770
+ - Jira REST API: https://developer.atlassian.com/cloud/jira/platform/rest/v3/
771
+ - JQL Reference: https://support.atlassian.com/jira-software-cloud/docs/use-advanced-search-with-jira-query-language-jql/
772
+ """
773
+ # Check if Jira integration is enabled via URL
774
+ base_url = config["jira"]["url"]
775
+ if not base_url:
776
+ logger.debug("Jira search skipped: jira.url not set in config")
777
+ return []
778
+
779
+ # Gracefully degrade if optional dependency not installed
780
+ if JIRA is None:
781
+ logger.warning("jira library not installed, skipping Jira search")
782
+ return []
783
+
784
+ # Extract configuration settings
785
+ username = config["jira"]["username"]
786
+ api_token = config["jira"]["api_token"]
787
+ max_results = config["jira"]["max_results"]
788
+ include_comments = config["jira"]["include_comments"]
789
+ include_custom_fields = config["jira"]["include_custom_fields"]
790
+ ca_bundle_path = config["ssl"]["ca_bundle_path"] or None
791
+
792
+ # Validate required credentials are present
793
+ if not (base_url and username and api_token):
794
+ missing = []
795
+ if not username:
796
+ missing.append("jira.username")
797
+ if not api_token:
798
+ missing.append("jira.api_token")
799
+ logger.warning(f"Jira search skipped: missing {', '.join(missing)} in config")
800
+ return []
801
+
802
+ try:
803
+ # Configure SSL certificate verification if CA bundle provided
804
+ # This enables Jira integration in corporate environments with custom CAs
805
+ jira_options = {"server": base_url}
806
+ if ca_bundle_path:
807
+ jira_options["verify"] = ca_bundle_path
808
+
809
+ # Use basic auth (username + API token) for authentication
810
+ # Works for both Jira Cloud and Data Center/Server
811
+ jira_client = JIRA(
812
+ options=jira_options,
813
+ basic_auth=(username, api_token)
814
+ )
815
+
816
+ # Construct JQL with text search and project filtering
817
+ jql = _prepare_jira_jql_query(query, config)
818
+ if not jql:
819
+ # Empty query after processing
820
+ return []
821
+
822
+ # Log JQL query for debugging
823
+ logger.debug(f"Jira JQL query: {jql}")
824
+
825
+ # Fetch all fields including custom fields for comprehensive search
826
+ # maxResults limits API response size for performance
827
+ issues = jira_client.search_issues(
828
+ jql,
829
+ maxResults=max_results,
830
+ fields='*all' # Get all fields including custom fields
831
+ )
832
+
833
+ # Convert issues to NodeWithScore format
834
+ nodes: list[NodeWithScore] = []
835
+ for issue in issues:
836
+ # Build searchable text representation
837
+ # This will be embedded and indexed by ChunkSilo's vector database
838
+ text = _jira_issue_to_text(issue, include_comments, include_custom_fields)
839
+
840
+ # Extract structured metadata for filtering and display
841
+ metadata = _jira_issue_to_metadata(issue, base_url)
842
+
843
+ # Create node with text and metadata
844
+ # Initial score is 0.0; FlashRank reranker will assign relevance scores
845
+ node = TextNode(text=text, metadata=metadata)
846
+ nodes.append(NodeWithScore(node=node, score=0.0))
847
+
848
+ logger.debug(f"Jira search returned {len(nodes)} issues")
849
+ # Return results for merging with other search sources
850
+ return nodes
851
+
852
+ except Exception as e:
853
+ # Catch all exceptions to prevent search pipeline failure
854
+ # Log errors with full traceback for debugging
855
+ logger.error(f"Failed to search Jira: {e}", exc_info=True)
856
+ # Return empty list to allow search to continue with other sources
857
+ return []
858
+
859
+
450
860
  def load_llamaindex_index(config: dict[str, Any] | None = None):
451
861
  """Load the LlamaIndex from storage."""
452
862
  if config is None:
@@ -471,6 +881,41 @@ def load_llamaindex_index(config: dict[str, Any] | None = None):
471
881
  return index
472
882
 
473
883
 
884
+ def _parse_iso8601_to_date(iso_string: str) -> str | None:
885
+ """Parse ISO 8601 timestamp to YYYY-MM-DD format.
886
+
887
+ Handles various ISO 8601 formats including:
888
+ - Z suffix: 2024-01-15T10:30:00Z
889
+ - Timezone with colon: 2024-01-15T10:30:00.000+00:00
890
+ - Timezone without colon: 2024-01-15T10:30:00.000+0000 (Jira format)
891
+
892
+ Args:
893
+ iso_string: ISO 8601 formatted datetime string
894
+
895
+ Returns:
896
+ Date in YYYY-MM-DD format, or None if parsing fails
897
+ """
898
+ if not iso_string:
899
+ return None
900
+
901
+ try:
902
+ # Normalize the timestamp
903
+ normalized = iso_string.strip()
904
+
905
+ # Replace Z suffix with +00:00
906
+ normalized = normalized.replace('Z', '+00:00')
907
+
908
+ # Insert colon in timezone offsets like +0000 → +00:00
909
+ # Matches ±HHMM at end of string, inserts colon: ±HH:MM
910
+ normalized = re.sub(r'([+-]\d{2})(\d{2})$', r'\1:\2', normalized)
911
+
912
+ # Parse and format
913
+ dt = datetime.fromisoformat(normalized)
914
+ return dt.strftime("%Y-%m-%d")
915
+ except Exception:
916
+ return None
917
+
918
+
474
919
  def _parse_date(date_str: str) -> datetime | None:
475
920
  """Parse date string in YYYY-MM-DD format."""
476
921
  try:
@@ -623,6 +1068,23 @@ def run_search(
623
1068
  if confluence_nodes:
624
1069
  nodes.extend(confluence_nodes)
625
1070
 
1071
+ # Search Jira (with timeout)
1072
+ jira_nodes: list[NodeWithScore] = []
1073
+ jira_timeout = config["jira"]["timeout"]
1074
+ if config["jira"]["url"]:
1075
+ try:
1076
+ with ThreadPoolExecutor(max_workers=1) as executor:
1077
+ future = executor.submit(_search_jira, enhanced_query, config)
1078
+ jira_nodes = future.result(timeout=jira_timeout)
1079
+ logger.info(f"Jira search returned {len(jira_nodes)} entries")
1080
+ except FuturesTimeoutError:
1081
+ logger.warning(f"Jira search timed out after {jira_timeout}s")
1082
+ except Exception as e:
1083
+ logger.error(f"Error during Jira search: {e}")
1084
+
1085
+ if jira_nodes:
1086
+ nodes.extend(jira_nodes)
1087
+
626
1088
  # Apply date filtering
627
1089
  if date_from or date_to:
628
1090
  original_count = len(nodes)
@@ -734,6 +1196,12 @@ def run_search(
734
1196
  from urllib.parse import quote
735
1197
  encoded_title = quote(title.replace(" ", "+"))
736
1198
  source_uri = f"{confluence_url.rstrip('/')}/spaces/~{encoded_title}"
1199
+ elif original_source == "Jira":
1200
+ # Build Jira issue URI using standard browse URL format
1201
+ jira_url = config["jira"]["url"]
1202
+ issue_key = metadata.get("issue_key")
1203
+ if jira_url and issue_key:
1204
+ source_uri = f"{jira_url.rstrip('/')}/browse/{issue_key}"
737
1205
  elif file_path:
738
1206
  source_uri = _resolve_file_uri(file_path, config)
739
1207
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunksilo
3
- Version: 2.0.0
3
+ Version: 2.1.1
4
4
  Summary: Local RAG-based semantic document search with MCP server interface
5
5
  Author: Fredrik Reveny
6
6
  License-Expression: Apache-2.0
@@ -26,39 +26,59 @@ Requires-Dist: pillow<11,>=10.3.0
26
26
  Requires-Dist: pypdf<7,>=5.1.0
27
27
  Requires-Dist: python-docx<2,>=1.1.0
28
28
  Requires-Dist: mcp<2,>=1.0.0
29
- Requires-Dist: python-dotenv<2,>=1.0.0
30
29
  Requires-Dist: huggingface-hub<2,>=0.22.0
31
30
  Requires-Dist: flashrank<1,>=0.1.0
32
31
  Requires-Dist: fastembed<1,>=0.5.0
33
32
  Requires-Dist: pyyaml<7,>=6.0
34
33
  Provides-Extra: confluence
35
34
  Requires-Dist: llama-index-readers-confluence<1,>=0.6.0; extra == "confluence"
35
+ Provides-Extra: jira
36
+ Requires-Dist: jira<4,>=3.5.0; extra == "jira"
36
37
  Provides-Extra: test
37
38
  Requires-Dist: pytest<9,>=7.4.0; extra == "test"
38
39
  Requires-Dist: requests<3,>=2.31.0; extra == "test"
39
40
  Dynamic: license-file
40
41
 
42
+ <p align="center">
43
+ <img src="https://raw.githubusercontent.com/Chetic/chunksilo/main/chunksilo.png" alt="ChunkSilo Logo" width="500">
44
+ </p>
45
+
41
46
  # ChunkSilo MCP Server
42
47
 
43
48
  ChunkSilo is like a local Google for your documents. It uses semantic search — matching by meaning rather than exact keywords — so your LLM can find relevant information across all your files even when the wording differs from your query. Point it at your PDFs, Word docs, Markdown, and text files, and it builds a fully searchable index locally on your machine.
44
49
 
45
- ## Overview
46
-
47
- - **No permissions headache**: Each user indexes only the files they already have access to. No centralized access-control system to build or maintain — document permissions stay exactly where they are.
48
- - **No infrastructure required**: Runs entirely on the user's own machine as an MCP server. Nothing to deploy, no servers to manage.
49
- - **Easy to set up**: Any user with an MCP-compatible LLM client can install, point at their document directories, and have everything indexed and searchable.
50
- - **Works with what you have**: Supports PDF, DOCX, DOC, Markdown, and TXT from local folders, network drives, or shared mounts.
50
+ - Runs entirely on your machine — no servers, no infrastructure
51
+ - Semantic search + keyword filename matching across PDF, DOCX, DOC, Markdown, and TXT
52
+ - Incremental indexing only reprocesses new or changed files
53
+ - Heading-aware results with source links back to the original file
54
+ - Date filtering and recency boosting
55
+ - Optional Confluence and Jira integrations
51
56
 
52
- ## Features
57
+ ### Example `search_docs` output
53
58
 
54
- - **Local indexing and search**: All indexing and search runs on your machine with bundled models — ChunkSilo itself makes no external network calls when `offline: true`. Note: search results are passed to your MCP client's LLM, which may be cloud-hosted.
55
- - **Incremental indexing**: Only reindexes new or changed files, so re-runs are fast even on large document collections.
56
- - **Heading-aware navigation**: Extracts headings from PDFs, Word docs, and Markdown so results include the full heading path (e.g. "Chapter 3 > Setup > Prerequisites").
57
- - **Date filtering and recency boost**: Search within a date range or let recent documents rank higher automatically.
58
- - **Dual retrieval**: Returns both meaning-based chunk matches and keyword-based filename matches separately, so file lookups don't get buried by unrelated content.
59
- - **Multi-directory with per-folder rules**: Index multiple directories with individual include/exclude glob patterns — useful for shared drives with mixed content.
60
- - **Confluence integration**: Optionally searches your Confluence instance alongside local files, with results returned in the same format.
61
- - **Source links**: Each result includes a clickable link back to the source file or Confluence page in supported MCP clients.
59
+ ```json
60
+ {
61
+ "matched_files": [
62
+ { "uri": "file:///docs/database-configuration.docx", "score": 0.8432 }
63
+ ],
64
+ "num_matched_files": 1,
65
+ "chunks": [
66
+ {
67
+ "text": "To configure the database connection, set the DATABASE_URL environment variable...",
68
+ "score": 0.912,
69
+ "location": {
70
+ "uri": "file:///docs/setup-guide.pdf",
71
+ "page": 12,
72
+ "line": null,
73
+ "heading_path": ["Getting Started", "Configuration", "Database"]
74
+ }
75
+ }
76
+ ],
77
+ "num_chunks": 1,
78
+ "query": "how to configure the database",
79
+ "retrieval_time": "0.42s"
80
+ }
81
+ ```
62
82
 
63
83
  ## Installation
64
84
 
@@ -71,6 +91,12 @@ pip install chunksilo
71
91
 
72
92
  # Or with Confluence support:
73
93
  pip install chunksilo[confluence]
94
+
95
+ # Or with Jira support:
96
+ pip install chunksilo[jira]
97
+
98
+ # Or with both Confluence and Jira:
99
+ pip install chunksilo[confluence,jira]
74
100
  ```
75
101
 
76
102
  Then:
@@ -184,6 +210,27 @@ All settings are optional and have sensible defaults.
184
210
  | `confluence.timeout` | `10.0` | Request timeout in seconds |
185
211
  | `confluence.max_results` | `30` | Maximum results per search |
186
212
 
213
+ #### Jira Settings (optional)
214
+
215
+ > **Note:** Jira integration requires the optional dependency. Install with: `pip install chunksilo[jira]`
216
+
217
+ | Setting | Default | Description |
218
+ | :--- | :--- | :--- |
219
+ | `jira.url` | `""` | Jira base URL (empty = disabled) |
220
+ | `jira.username` | `""` | Jira username/email |
221
+ | `jira.api_token` | `""` | Jira API token |
222
+ | `jira.timeout` | `10.0` | Request timeout in seconds |
223
+ | `jira.max_results` | `30` | Maximum results per search |
224
+ | `jira.projects` | `[]` | Project keys to search (empty = all) |
225
+ | `jira.include_comments` | `false` | Include issue comments in search |
226
+ | `jira.include_custom_fields` | `false` | Include custom fields in search |
227
+
228
+ **Creating a Jira API Token:**
229
+ 1. Log into Jira
230
+ 2. Go to Account Settings > Security > API Tokens
231
+ 3. Click "Create API Token"
232
+ 4. Copy the token and add it to your config
233
+
187
234
  #### SSL Settings (optional)
188
235
 
189
236
  | Setting | Default | Description |
@@ -357,6 +404,7 @@ Add to `mcp_settings.json` (typically in `~/.config/Code/User/globalStorage/roov
357
404
  - **Retrieval errors**: Check paths in your MCP client configuration.
358
405
  - **Offline mode**: PyPI installs default to `offline: false` (models auto-download). The offline bundle includes pre-downloaded models and sets `offline: true`. Set `retrieval.offline: true` in `config.yaml` to prevent network calls after initial model download.
359
406
  - **Confluence Integration**: Install with `pip install chunksilo[confluence]`, then set `confluence.url`, `confluence.username`, and `confluence.api_token` in `config.yaml`.
407
+ - **Jira Integration**: Install with `pip install chunksilo[jira]`, then set `jira.url`, `jira.username`, and `jira.api_token` in `config.yaml`. Optionally configure `jira.projects` to restrict search to specific project keys.
360
408
  - **Custom CA Bundle**: Set `ssl.ca_bundle_path` in `config.yaml` for custom certificates.
361
409
  - **Network mounts**: Unavailable directories are skipped with a warning; indexing continues with available directories.
362
410
  - **Legacy .doc files**: Requires LibreOffice to be installed for automatic conversion to .docx. If LibreOffice is not found, .doc files are skipped with a warning. Full heading extraction is supported.
@@ -0,0 +1,15 @@
1
+ chunksilo/__init__.py,sha256=JuibWxTebuRgN7IF8oNtLuEEpFlXQFDtWOEUH3keAcs,121
2
+ chunksilo/__main__.py,sha256=eY8-KfJfBz0nibDPY_jv2RvkLXEx7ZDSPRWiDJb7PpY,39
3
+ chunksilo/cfgload.py,sha256=dvzDSftqbLCEhYb76OPA4hP_ToJE7FYdpko67m_2RkA,4639
4
+ chunksilo/cli.py,sha256=ZKXzW-HOmnLXC4Ynu-D86XIbp7bdQ_OYVHuHFVlj41U,4170
5
+ chunksilo/confluence_html_formatter.py,sha256=D8pb5TCrai6exIqeajH49y4D_t0jfQkmI6aNm4BzPIg,2828
6
+ chunksilo/index.py,sha256=ECSTH8c0ZhnTxAHxM7dcCfFOGTk99STHuyDhc3uMCO0,51618
7
+ chunksilo/search.py,sha256=TakSH-62oM4mJg1v3O9ylivbF_IROBNfHVfJ8lkZMrw,47879
8
+ chunksilo/server.py,sha256=PhQWMvEGb3UqWwk0tm44kTOnKH2NH4NmshnKx11xwh8,3374
9
+ chunksilo-2.1.1.dist-info/licenses/LICENSE,sha256=kda7NTahQy3nKvLe-LGIVCdI-qePTyqNNx2e6HLVH3k,10766
10
+ chunksilo-2.1.1.dist-info/licenses/NOTICE,sha256=58GCfasR-XT7RPvz9OoWb4NrPg-3AqEwzw9FJOhngZo,1228
11
+ chunksilo-2.1.1.dist-info/METADATA,sha256=B-JAQGxX34vLxk08Q3sycVl5TIRwpwxRJLYnBCTvxWU,13789
12
+ chunksilo-2.1.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
13
+ chunksilo-2.1.1.dist-info/entry_points.txt,sha256=TfgrfBQyIRHMG5NFaXR-owDLsjHBCxjJ4YGIGBnVzbQ,87
14
+ chunksilo-2.1.1.dist-info/top_level.txt,sha256=xfAH0GhTfZbwkbi_DgUA5keNP-osqVNcoZjwkhpwVms,10
15
+ chunksilo-2.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.10.1)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,15 +0,0 @@
1
- chunksilo/__init__.py,sha256=Ph1w-3A5CKoGdxzUNVlrB483eOM5JxAQN8K88-6HbuY,121
2
- chunksilo/__main__.py,sha256=eY8-KfJfBz0nibDPY_jv2RvkLXEx7ZDSPRWiDJb7PpY,39
3
- chunksilo/cfgload.py,sha256=A7ab2RkeYQhFTTvn56I900lWpXCqXb8ao8DGYeUai1U,4359
4
- chunksilo/cli.py,sha256=ZKXzW-HOmnLXC4Ynu-D86XIbp7bdQ_OYVHuHFVlj41U,4170
5
- chunksilo/confluence_html_formatter.py,sha256=D8pb5TCrai6exIqeajH49y4D_t0jfQkmI6aNm4BzPIg,2828
6
- chunksilo/index.py,sha256=ECSTH8c0ZhnTxAHxM7dcCfFOGTk99STHuyDhc3uMCO0,51618
7
- chunksilo/search.py,sha256=8_6zuvyVdZHNMCBHrK3NcP0Ct7oYeCSLuoOHT8RxD0o,28722
8
- chunksilo/server.py,sha256=PhQWMvEGb3UqWwk0tm44kTOnKH2NH4NmshnKx11xwh8,3374
9
- chunksilo-2.0.0.dist-info/licenses/LICENSE,sha256=kda7NTahQy3nKvLe-LGIVCdI-qePTyqNNx2e6HLVH3k,10766
10
- chunksilo-2.0.0.dist-info/licenses/NOTICE,sha256=58GCfasR-XT7RPvz9OoWb4NrPg-3AqEwzw9FJOhngZo,1228
11
- chunksilo-2.0.0.dist-info/METADATA,sha256=AA4R48_d6h7V89dHM9hb0av53WlfrDKZzALfOZ0jywU,13360
12
- chunksilo-2.0.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
13
- chunksilo-2.0.0.dist-info/entry_points.txt,sha256=TfgrfBQyIRHMG5NFaXR-owDLsjHBCxjJ4YGIGBnVzbQ,87
14
- chunksilo-2.0.0.dist-info/top_level.txt,sha256=xfAH0GhTfZbwkbi_DgUA5keNP-osqVNcoZjwkhpwVms,10
15
- chunksilo-2.0.0.dist-info/RECORD,,