langroid 0.1.139__py3-none-any.whl → 0.1.219__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. langroid/__init__.py +70 -0
  2. langroid/agent/__init__.py +22 -0
  3. langroid/agent/base.py +120 -33
  4. langroid/agent/batch.py +134 -35
  5. langroid/agent/callbacks/__init__.py +0 -0
  6. langroid/agent/callbacks/chainlit.py +608 -0
  7. langroid/agent/chat_agent.py +164 -100
  8. langroid/agent/chat_document.py +19 -2
  9. langroid/agent/openai_assistant.py +20 -10
  10. langroid/agent/special/__init__.py +33 -10
  11. langroid/agent/special/doc_chat_agent.py +521 -108
  12. langroid/agent/special/lance_doc_chat_agent.py +258 -0
  13. langroid/agent/special/lance_rag/__init__.py +9 -0
  14. langroid/agent/special/lance_rag/critic_agent.py +136 -0
  15. langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
  16. langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
  17. langroid/agent/special/lance_tools.py +44 -0
  18. langroid/agent/special/neo4j/__init__.py +0 -0
  19. langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
  20. langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
  21. langroid/agent/special/neo4j/utils/__init__.py +0 -0
  22. langroid/agent/special/neo4j/utils/system_message.py +46 -0
  23. langroid/agent/special/relevance_extractor_agent.py +23 -7
  24. langroid/agent/special/retriever_agent.py +29 -174
  25. langroid/agent/special/sql/__init__.py +7 -0
  26. langroid/agent/special/sql/sql_chat_agent.py +47 -23
  27. langroid/agent/special/sql/utils/__init__.py +11 -0
  28. langroid/agent/special/sql/utils/description_extractors.py +95 -46
  29. langroid/agent/special/sql/utils/populate_metadata.py +28 -21
  30. langroid/agent/special/table_chat_agent.py +43 -9
  31. langroid/agent/task.py +423 -114
  32. langroid/agent/tool_message.py +67 -10
  33. langroid/agent/tools/__init__.py +8 -0
  34. langroid/agent/tools/duckduckgo_search_tool.py +66 -0
  35. langroid/agent/tools/google_search_tool.py +11 -0
  36. langroid/agent/tools/metaphor_search_tool.py +67 -0
  37. langroid/agent/tools/recipient_tool.py +6 -24
  38. langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
  39. langroid/cachedb/__init__.py +6 -0
  40. langroid/embedding_models/__init__.py +24 -0
  41. langroid/embedding_models/base.py +9 -1
  42. langroid/embedding_models/models.py +117 -17
  43. langroid/embedding_models/protoc/embeddings.proto +19 -0
  44. langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
  45. langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
  46. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
  47. langroid/embedding_models/remote_embeds.py +153 -0
  48. langroid/language_models/__init__.py +22 -0
  49. langroid/language_models/azure_openai.py +47 -4
  50. langroid/language_models/base.py +26 -10
  51. langroid/language_models/config.py +5 -0
  52. langroid/language_models/openai_gpt.py +407 -121
  53. langroid/language_models/prompt_formatter/__init__.py +9 -0
  54. langroid/language_models/prompt_formatter/base.py +4 -6
  55. langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
  56. langroid/language_models/utils.py +10 -9
  57. langroid/mytypes.py +10 -4
  58. langroid/parsing/__init__.py +33 -1
  59. langroid/parsing/document_parser.py +259 -63
  60. langroid/parsing/image_text.py +32 -0
  61. langroid/parsing/parse_json.py +143 -0
  62. langroid/parsing/parser.py +20 -7
  63. langroid/parsing/repo_loader.py +108 -46
  64. langroid/parsing/search.py +8 -0
  65. langroid/parsing/table_loader.py +44 -0
  66. langroid/parsing/url_loader.py +59 -13
  67. langroid/parsing/urls.py +18 -9
  68. langroid/parsing/utils.py +130 -9
  69. langroid/parsing/web_search.py +73 -0
  70. langroid/prompts/__init__.py +7 -0
  71. langroid/prompts/chat-gpt4-system-prompt.md +68 -0
  72. langroid/prompts/prompts_config.py +1 -1
  73. langroid/utils/__init__.py +10 -0
  74. langroid/utils/algorithms/__init__.py +3 -0
  75. langroid/utils/configuration.py +0 -1
  76. langroid/utils/constants.py +4 -0
  77. langroid/utils/logging.py +2 -5
  78. langroid/utils/output/__init__.py +15 -2
  79. langroid/utils/output/status.py +33 -0
  80. langroid/utils/pandas_utils.py +30 -0
  81. langroid/utils/pydantic_utils.py +446 -4
  82. langroid/utils/system.py +36 -1
  83. langroid/vector_store/__init__.py +34 -2
  84. langroid/vector_store/base.py +33 -2
  85. langroid/vector_store/chromadb.py +42 -13
  86. langroid/vector_store/lancedb.py +226 -60
  87. langroid/vector_store/meilisearch.py +7 -6
  88. langroid/vector_store/momento.py +3 -2
  89. langroid/vector_store/qdrantdb.py +82 -11
  90. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/METADATA +190 -129
  91. langroid-0.1.219.dist-info/RECORD +127 -0
  92. langroid/agent/special/recipient_validator_agent.py +0 -157
  93. langroid/parsing/json.py +0 -64
  94. langroid/utils/web/selenium_login.py +0 -36
  95. langroid-0.1.139.dist-info/RECORD +0 -103
  96. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
  97. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
langroid/parsing/repo_loader.py CHANGED
@@ -10,15 +10,15 @@ from pathlib import Path
  from typing import Any, Dict, List, Optional, Tuple, Union
  from urllib.parse import urlparse
 
- from bs4 import BeautifulSoup
  from dotenv import load_dotenv
  from github import Github
  from github.ContentFile import ContentFile
+ from github.Label import Label
  from github.Repository import Repository
- from pydantic import BaseSettings
+ from pydantic import BaseModel, BaseSettings, Field
 
  from langroid.mytypes import DocMetaData, Document
- from langroid.parsing.document_parser import DocumentParser
+ from langroid.parsing.document_parser import DocumentParser, DocumentType
  from langroid.parsing.parser import Parser, ParsingConfig
 
  logger = logging.getLogger(__name__)
@@ -43,6 +43,22 @@ def _has_files(directory: str) -> bool:
      return False
 
 
+ # Pydantic model for GitHub issue data
+ class IssueData(BaseModel):
+     state: str = Field(..., description="State of issue e.g. open or closed")
+     year: int = Field(..., description="Year issue was created")
+     month: int = Field(..., description="Month issue was created")
+     day: int = Field(..., description="Day issue was created")
+     assignee: Optional[str] = Field(..., description="Assignee of issue")
+     size: Optional[str] = Field(..., description="Size of issue, e.g. XS, S, M, L, XXL")
+     text: str = Field(..., description="Text of issue, i.e. description body")
+
+
+ def get_issue_size(labels: List[Label]) -> str | None:
+     sizes = ["XS", "S", "M", "L", "XL", "XXL"]
+     return next((label.name for label in labels if label.name in sizes), None)
+
+
  class RepoLoaderConfig(BaseSettings):
      """
      Configuration for RepoLoader.
@@ -155,6 +171,27 @@ class RepoLoader:
      def _get_dir_name(self) -> str:
          return urlparse(self.url).path.replace("/", "_")
 
+     def get_issues(self, k: int | None = 100) -> List[IssueData]:
+         """Get up to k issues from the GitHub repo."""
+         if k is None:
+             issues = self.repo.get_issues(state="all")
+         else:
+             issues = self.repo.get_issues(state="all")[:k]
+         issue_data_list = []
+         for issue in issues:
+             issue_data = IssueData(
+                 state=issue.state,
+                 year=issue.created_at.year,
+                 month=issue.created_at.month,
+                 day=issue.created_at.day,
+                 assignee=issue.assignee.login if issue.assignee else None,
+                 size=get_issue_size(issue.labels),
+                 text=issue.body or "No issue description body.",
+             )
+             issue_data_list.append(issue_data)
+
+         return issue_data_list
+
      @staticmethod
      def _file_type(name: str) -> str:
          """
@@ -453,18 +490,25 @@ class RepoLoader:
 
      @staticmethod
      def get_documents(
-         path: str,
+         path: str | bytes,
          parser: Parser = Parser(ParsingConfig()),
          file_types: Optional[List[str]] = None,
          exclude_dirs: Optional[List[str]] = None,
          depth: int = -1,
          lines: Optional[int] = None,
+         doc_type: str | DocumentType | None = None,
      ) -> List[Document]:
          """
          Recursively get all files under a path as Document objects.
 
          Args:
-             path (str): The path to the directory or file.
+             path (str|bytes): The path to the directory or file, or bytes content.
+                 The bytes option is meant to support the case where the content
+                 has already been read from a file in an upstream process
+                 (e.g. from an API or a database), and we want to avoid having to
+                 write it to a temporary file just to read it again.
+                 (which can be very slow for large files,
+                 especially in a docker container)
              parser (Parser): Parser to use to parse files.
              file_types (List[str], optional): List of file extensions OR
                  filenames OR file_path_names to include.
@@ -475,6 +519,7 @@ class RepoLoader:
                  which includes all depths.
              lines (int, optional): Number of lines to read from each file.
                  Defaults to None, which reads all lines.
+             doc_type (str|DocumentType, optional): The type of document to parse.
 
          Returns:
              List[Document]: List of Document objects representing files.
@@ -482,52 +527,69 @@ class RepoLoader:
          """
          docs = []
          file_paths = []
-         path_obj = Path(path).resolve()
-
-         if path_obj.is_file():
-             file_paths.append(str(path_obj))
+         if isinstance(path, bytes):
+             file_paths.append(path)
          else:
-             path_depth = len(path_obj.parts)
-             for root, dirs, files in os.walk(path):
-                 # Exclude directories if needed
-                 if exclude_dirs:
-                     dirs[:] = [d for d in dirs if d not in exclude_dirs]
-
-                 current_depth = len(Path(root).resolve().parts) - path_depth
-                 if depth == -1 or current_depth <= depth:
-                     for file in files:
-                         file_path = str(Path(root) / file)
-                         if (
-                             file_types is None
-                             or RepoLoader._file_type(file_path) in file_types
-                             or os.path.basename(file_path) in file_types
-                             or file_path in file_types
-                         ):
-                             file_paths.append(file_path)
+             path_obj = Path(path).resolve()
+
+             if path_obj.is_file():
+                 file_paths.append(str(path_obj))
+             else:
+                 path_depth = len(path_obj.parts)
+                 for root, dirs, files in os.walk(path):
+                     # Exclude directories if needed
+                     if exclude_dirs:
+                         dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                     current_depth = len(Path(root).resolve().parts) - path_depth
+                     if depth == -1 or current_depth <= depth:
+                         for file in files:
+                             file_path = str(Path(root) / file)
+                             if (
+                                 file_types is None
+                                 or RepoLoader._file_type(file_path) in file_types
+                                 or os.path.basename(file_path) in file_types
+                                 or file_path in file_types
+                             ):
+                                 file_paths.append(file_path)
 
          for file_path in file_paths:
-             _, file_extension = os.path.splitext(file_path)
-             if file_extension.lower() in [".pdf", ".docx"]:
-                 doc_parser = DocumentParser.create(
+             docs.extend(
+                 DocumentParser.chunks_from_path_or_bytes(
                      file_path,
-                     parser.config,
-                 )
-                 docs.extend(doc_parser.get_doc_chunks())
-             else:
-                 with open(file_path, "r") as f:
-                     if lines is not None:
-                         file_lines = list(itertools.islice(f, lines))
-                         content = "\n".join(line.strip() for line in file_lines)
-                     else:
-                         content = f.read()
-                 soup = BeautifulSoup(content, "html.parser")
-                 text = soup.get_text()
-                 docs.append(
-                     Document(
-                         content=text,
-                         metadata=DocMetaData(source=str(file_path)),
-                     )
+                     parser,
+                     doc_type=doc_type,
+                     lines=lines,
                  )
+             )
+             # dtype: DocumentType = DocumentParser._document_type(file_path, doc_type)
+             # if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+             #     doc_parser = DocumentParser.create(
+             #         file_path,
+             #         parser.config,
+             #         doc_type=doc_type,
+             #     )
+             #     new_chunks = doc_parser.get_doc_chunks()
+             #     if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
+             #         doc_parser = ImagePdfParser(file_path, parser.config)
+             #         new_chunks = doc_parser.get_doc_chunks()
+             #     docs.extend(new_chunks)
+             # else:
+             #     # try getting as plain text; these will be chunked downstream
+             #     with open(file_path, "r") as f:
+             #         if lines is not None:
+             #             file_lines = list(itertools.islice(f, lines))
+             #             content = "\n".join(line.strip() for line in file_lines)
+             #         else:
+             #             content = f.read()
+             #     soup = BeautifulSoup(content, "html.parser")
+             #     text = soup.get_text()
+             #     docs.append(
+             #         Document(
+             #             content=text,
+             #             metadata=DocMetaData(source=str(file_path)),
+             #         )
+             #     )
 
          return docs
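A small sketch (not part of the diff) of how the reworked `get_documents` might be exercised with the new `bytes` and `doc_type` options. The file name is illustrative, and `DocumentType.PDF` is assumed to be a member of the `DocumentType` enum referenced above.

```python
# Hypothetical sketch: passing already-read bytes to RepoLoader.get_documents,
# avoiding a temporary file. Assumes DocumentType.PDF exists in document_parser.
from langroid.parsing.document_parser import DocumentType
from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.repo_loader import RepoLoader

with open("report.pdf", "rb") as f:  # illustrative file
    pdf_bytes = f.read()

docs = RepoLoader.get_documents(
    pdf_bytes,                      # bytes content instead of a path
    parser=Parser(ParsingConfig()),
    doc_type=DocumentType.PDF,      # tell the parser what the bytes are
)
print(len(docs), "chunks extracted")
```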
langroid/parsing/search.py CHANGED
@@ -64,6 +64,14 @@ def find_fuzzy_matches_in_docs(
              break
      if words_after is None and words_before is None:
          return orig_doc_matches
+     if len(orig_doc_matches) == 0:
+         return []
+     if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
+         # If there are fields beyond just content and metadata,
+         # we do NOT want to create new document objects with content fields
+         # based on words_before and words_after, since we don't know how to
+         # set those other fields.
+         return orig_doc_matches
 
      contextual_matches = []
      for match in orig_doc_matches:
langroid/parsing/table_loader.py CHANGED
@@ -1,4 +1,5 @@
  from csv import Sniffer
+ from typing import List
 
  import pandas as pd
 
@@ -48,3 +49,46 @@ def read_tabular_data(path_or_url: str, sep: None | str = None) -> pd.DataFrame:
              "Unable to read data. "
              "Please ensure it is correctly formatted. Error: " + str(e)
          )
+
+
+ def describe_dataframe(
+     df: pd.DataFrame, filter_fields: List[str] = [], n_vals: int = 10
+ ) -> str:
+     """
+     Generates a description of the columns in the dataframe,
+     along with a listing of up to `n_vals` unique values for each column.
+     Intended to be used to insert into an LLM context so it can generate
+     appropriate queries or filters on the df.
+
+     Args:
+         df (pd.DataFrame): The dataframe to describe.
+         filter_fields (list): A list of fields that can be used for filtering.
+             When non-empty, the values-list will be restricted to these.
+         n_vals (int): How many unique values to show for each column.
+
+     Returns:
+         str: A description of the dataframe.
+     """
+     description = []
+     for column in df.columns.to_list():
+         unique_values = df[column].dropna().unique()
+         unique_count = len(unique_values)
+         if column not in filter_fields:
+             values_desc = f"{unique_count} unique values"
+         else:
+             if unique_count > n_vals:
+                 displayed_values = unique_values[:n_vals]
+                 more_count = unique_count - n_vals
+                 values_desc = f" Values - {displayed_values}, ... {more_count} more"
+             else:
+                 values_desc = f" Values - {unique_values}"
+         col_type = "string" if df[column].dtype == "object" else df[column].dtype
+         col_desc = f"* {column} ({col_type}); {values_desc}"
+         description.append(col_desc)
+
+     all_cols = "\n".join(description)
+
+     return f"""
+     Name of each field, its type and unique values (up to {n_vals}):
+     {all_cols}
+     """
langroid/parsing/url_loader.py CHANGED
@@ -1,6 +1,9 @@
  import logging
+ import os
+ from tempfile import NamedTemporaryFile
  from typing import List, no_type_check
 
+ import requests
  import trafilatura
  from trafilatura.downloads import (
      add_to_compressed_dict,
@@ -9,7 +12,7 @@ from trafilatura.downloads import (
  )
 
  from langroid.mytypes import DocMetaData, Document
- from langroid.parsing.document_parser import DocumentParser
+ from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
  from langroid.parsing.parser import Parser, ParsingConfig
 
  logging.getLogger("trafilatura").setLevel(logging.ERROR)
@@ -44,22 +47,65 @@ class URLLoader:
              sleep_time=5,
          )
          for url, result in buffered_downloads(buffer, threads):
-             if url.lower().endswith(".pdf") or url.lower().endswith(".docx"):
+             if (
+                 url.lower().endswith(".pdf")
+                 or url.lower().endswith(".docx")
+                 or url.lower().endswith(".doc")
+             ):
                  doc_parser = DocumentParser.create(
                      url,
                      self.parser.config,
                  )
-                 docs.extend(doc_parser.get_doc_chunks())
+                 new_chunks = doc_parser.get_doc_chunks()
+                 if len(new_chunks) == 0:
+                     # If the document is empty, try to extract images
+                     img_parser = ImagePdfParser(url, self.parser.config)
+                     new_chunks = img_parser.get_doc_chunks()
+                 docs.extend(new_chunks)
              else:
-                 text = trafilatura.extract(
-                     result,
-                     no_fallback=False,
-                     favor_recall=True,
-                 )
-                 if text is None and result is not None and isinstance(result, str):
-                     text = result
-                 if text is not None and text != "":
-                     docs.append(
-                         Document(content=text, metadata=DocMetaData(source=url))
+                 # Try to detect content type and handle accordingly
+                 headers = requests.head(url).headers
+                 content_type = headers.get("Content-Type", "").lower()
+                 temp_file_suffix = None
+                 if "application/pdf" in content_type:
+                     temp_file_suffix = ".pdf"
+                 elif (
+                     "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                     in content_type
+                 ):
+                     temp_file_suffix = ".docx"
+                 elif "application/msword" in content_type:
+                     temp_file_suffix = ".doc"
+
+                 if temp_file_suffix:
+                     # Download the document content
+                     response = requests.get(url)
+                     with NamedTemporaryFile(
+                         delete=False, suffix=temp_file_suffix
+                     ) as temp_file:
+                         temp_file.write(response.content)
+                         temp_file_path = temp_file.name
+                     # Process the downloaded document
+                     doc_parser = DocumentParser.create(
+                         temp_file_path, self.parser.config
+                     )
+                     docs.extend(doc_parser.get_doc_chunks())
+                     # Clean up the temporary file
+                     os.remove(temp_file_path)
+                 else:
+                     text = trafilatura.extract(
+                         result,
+                         no_fallback=False,
+                         favor_recall=True,
                      )
+                     if (
+                         text is None
+                         and result is not None
+                         and isinstance(result, str)
+                     ):
+                         text = result
+                     if text is not None and text != "":
+                         docs.append(
+                             Document(content=text, metadata=DocMetaData(source=url))
+                         )
          return docs
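A brief sketch (not from the diff) of how the updated loader might be driven. The URL is illustrative; the constructor call `URLLoader(urls=..., parser=...)` mirrors how the class is invoked in `extract_content_from_path` further below.

```python
# Hypothetical sketch: URLLoader now falls back to Content-Type sniffing
# for URLs that do not end in .pdf/.docx/.doc. The URL below is illustrative.
from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.url_loader import URLLoader

loader = URLLoader(
    urls=["https://example.com/annual-report"],  # may serve application/pdf
    parser=Parser(ParsingConfig()),
)
docs = loader.load()
for doc in docs[:3]:
    print(doc.metadata.source, len(doc.content))
```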
langroid/parsing/urls.py CHANGED
@@ -112,26 +112,35 @@ def is_url(s: str) -> bool:
          return False
 
 
- def get_urls_and_paths(inputs: List[str]) -> Tuple[List[str], List[str]]:
+ def get_urls_paths_bytes_indices(
+     inputs: List[str | bytes],
+ ) -> Tuple[List[int], List[int], List[int]]:
      """
-     Given a list of inputs, return a list of URLs and a list of paths.
+     Given a list of inputs, return a
+     list of indices of URLs, list of indices of paths, list of indices of byte-contents.
      Args:
-         inputs: list of strings
+         inputs: list of strings or bytes
      Returns:
-         list of URLs, list of paths
+         list of Indices of URLs,
+         list of indices of paths,
+         list of indices of byte-contents
      """
      urls = []
      paths = []
-     for item in inputs:
+     byte_list = []
+     for i, item in enumerate(inputs):
+         if isinstance(item, bytes):
+             byte_list.append(i)
+             continue
          try:
-             m = Url(url=parse_obj_as(HttpUrl, item))
-             urls.append(str(m.url))
+             Url(url=parse_obj_as(HttpUrl, item))
+             urls.append(i)
          except ValidationError:
              if os.path.exists(item):
-                 paths.append(item)
+                 paths.append(i)
              else:
                  logger.warning(f"{item} is neither a URL nor a path.")
-     return urls, paths
+     return urls, paths, byte_list
 
 
  def crawl_url(url: str, max_urls: int = 1) -> List[str]:
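A small sketch of the new index-based return value; the local path is illustrative and assumed to exist (otherwise it is skipped with a warning).

```python
# Sketch: the function now returns *indices* into the input list, so callers
# can recover the original items (including raw bytes) by position.
from langroid.parsing.urls import get_urls_paths_bytes_indices

inputs = [
    "https://example.com/page.html",  # a URL
    b"%PDF-1.7 ...raw bytes...",      # already-loaded content
    "/tmp/notes.txt",                 # a local path (assumed to exist)
]
url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(inputs)
urls = [inputs[i] for i in url_idxs]         # -> the example.com URL
byte_items = [inputs[i] for i in byte_idxs]  # -> the bytes entry
```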
langroid/parsing/utils.py CHANGED
@@ -1,16 +1,26 @@
  import difflib
+ import logging
  import random
  import re
  from functools import cache
  from itertools import islice
- from typing import Any, Iterable, List
+ from typing import Iterable, List, Sequence, TypeVar
 
  import nltk
  from faker import Faker
 
+ from langroid.mytypes import Document
+ from langroid.parsing.document_parser import DocumentType
+ from langroid.parsing.parser import Parser, ParsingConfig
+ from langroid.parsing.repo_loader import RepoLoader
+ from langroid.parsing.url_loader import URLLoader
+ from langroid.parsing.urls import get_urls_paths_bytes_indices
+
  Faker.seed(23)
  random.seed(43)
 
+ logger = logging.getLogger(__name__)
+
 
  # Ensures the NLTK resource is available
  @cache
@@ -21,7 +31,10 @@ def download_nltk_resource(resource: str) -> None:
          nltk.download(resource, quiet=True)
 
 
- def batched(iterable: Iterable[Any], n: int) -> Iterable[Any]:
+ T = TypeVar("T")
+
+
+ def batched(iterable: Iterable[T], n: int) -> Iterable[Sequence[T]]:
      """Batch data into tuples of length n. The last batch may be shorter."""
      # batched('ABCDEFG', 3) --> ABC DEF G
      if n < 1:
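The re-typed `batched` behaves as before; a one-liner sketch of the generic typing in use:

```python
# Sketch: batched still yields fixed-size batches (the last may be shorter),
# now typed generically via TypeVar instead of Any.
from langroid.parsing.utils import batched

for batch in batched("ABCDEFG", 3):
    print(batch)  # batches of 3 characters, then a final shorter batch
```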
@@ -101,14 +114,35 @@ def split_paragraphs(text: str) -> List[str]:
      return [para.strip() for para in paras if para.strip()]
 
 
- def number_segments(s: str, len: int = 1) -> str:
+ def split_newlines(text: str) -> List[str]:
+     """
+     Split the input text into lines using "\n" as the delimiter.
+
+     Args:
+         text (str): The input text.
+
+     Returns:
+         list: A list of lines.
+     """
+     lines = re.split(r"\n", text)
+     return [line.strip() for line in lines if line.strip()]
+
+
+ def number_segments(s: str, granularity: int = 1) -> str:
      """
      Number the segments in a given text, preserving paragraph structure.
-     A segment is a sequence of `len` consecutive sentences.
+     A segment is a sequence of `len` consecutive "sentences", where a "sentence"
+     is either a normal sentence, or if there isn't enough punctuation to properly
+     identify sentences, then we use a pseudo-sentence via heuristics (split by newline
+     or failing that, just split every 40 words). The goal here is simply to number
+     segments at a reasonable granularity so the LLM can identify relevant segments,
+     in the RelevanceExtractorAgent.
 
      Args:
          s (str): The input text.
-         len (int): The number of sentences in a segment.
+         granularity (int): The number of sentences in a segment.
+             If this is -1, then the entire text is treated as a single segment,
+             and is numbered as <#1#>.
 
      Returns:
          str: The text with segments numbered in the style <#1#>, <#2#> etc.
@@ -117,15 +151,42 @@ def number_segments(s: str, len: int = 1) -> str:
      >>> number_segments("Hello world! How are you? Have a good day.")
      '<#1#> Hello world! <#2#> How are you? <#3#> Have a good day.'
      """
+     if granularity < 0:
+         return "<#1#> " + s
      numbered_text = []
      count = 0
 
      paragraphs = split_paragraphs(s)
      for paragraph in paragraphs:
          sentences = nltk.sent_tokenize(paragraph)
+         # Some docs are problematic (e.g. resumes) and have no (or too few) periods,
+         # so we can't split usefully into sentences.
+         # We try a series of heuristics to split into sentences,
+         # until the avg num words per sentence is less than 40.
+         avg_words_per_sentence = sum(
+             len(nltk.word_tokenize(sentence)) for sentence in sentences
+         ) / len(sentences)
+         if avg_words_per_sentence > 40:
+             sentences = split_newlines(paragraph)
+             avg_words_per_sentence = sum(
+                 len(nltk.word_tokenize(sentence)) for sentence in sentences
+             ) / len(sentences)
+             if avg_words_per_sentence > 40:
+                 # Still too long, just split on every 40 words
+                 sentences = []
+                 for sentence in nltk.sent_tokenize(paragraph):
+                     words = nltk.word_tokenize(sentence)
+                     for i in range(0, len(words), 40):
+                         # if there are less than 20 words left after this,
+                         # just add them to the last sentence and break
+                         if len(words) - i < 20:
+                             sentences.append(" ".join(words[i:]))
+                             break
+                         else:
+                             sentences.append(" ".join(words[i : i + 40]))
          for i, sentence in enumerate(sentences):
-             num = count // len + 1
-             number_prefix = f"<#{num}#>" if count % len == 0 else ""
+             num = count // granularity + 1
+             number_prefix = f"<#{num}#>" if count % granularity == 0 else ""
              sentence = f"{number_prefix} {sentence}"
              count += 1
              sentences[i] = sentence
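A short sketch of the renamed `granularity` parameter and the new whole-text mode, based on the docstring example above:

```python
# Sketch: segment numbering with the renamed `granularity` argument.
from langroid.parsing.utils import number_segments

text = "Hello world! How are you? Have a good day."
print(number_segments(text, granularity=1))
# '<#1#> Hello world! <#2#> How are you? <#3#> Have a good day.'
print(number_segments(text, granularity=-1))
# '<#1#> Hello world! How are you? Have a good day.'  (whole text as one segment)
```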
@@ -136,7 +197,7 @@ def number_segments(s: str, len: int = 1) -> str:
 
 
  def number_sentences(s: str) -> str:
-     return number_segments(s, len=1)
+     return number_segments(s, granularity=1)
 
 
  def parse_number_range_list(specs: str) -> List[int]:
@@ -156,6 +217,9 @@ def parse_number_range_list(specs: str) -> List[int]:
      """
      spec_indices = set()  # type: ignore
      for part in specs.split(","):
+         # some weak LLMs may generate <#1#> instead of 1, so extract just the digits
+         # or the "-"
+         part = "".join(char for char in part if char.isdigit() or char == "-")
          if "-" in part:
              start, end = map(int, part.split("-"))
              spec_indices.update(range(start, end + 1))
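The new digit-extraction step makes the parser tolerant of segment markers in the spec string; a small sketch (expected results stated loosely, since the function's final return is outside this hunk):

```python
# Sketch: range specs with stray segment markers are now tolerated.
from langroid.parsing.utils import parse_number_range_list

print(parse_number_range_list("2,5-7"))              # expect the indices 2, 5, 6, 7
print(parse_number_range_list("<#2#>,<#5#>-<#7#>"))  # same indices after stripping non-digits
```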
@@ -224,7 +288,8 @@ def extract_numbered_segments(s: str, specs: str) -> str:
 
      # Regular expression to identify numbered segments like
      # <#1#> Hello world! This is me. <#2#> How are you? <#3#> Have a good day.
-     segment_pattern = re.compile(r"<#(\d+)#> ((?:(?!<#).)+)")
+     # Note we match any character between segment markers, including newlines.
+     segment_pattern = re.compile(r"<#(\d+)#>([\s\S]*?)(?=<#\d+#>|$)")
 
      # Split the text into paragraphs while preserving their boundaries
      paragraphs = split_paragraphs(s)
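The widened pattern now captures segment bodies that span newlines; a standalone sketch of just the regex change:

```python
# Sketch: the new pattern matches segment bodies across newlines.
import re

segment_pattern = re.compile(r"<#(\d+)#>([\s\S]*?)(?=<#\d+#>|$)")
text = "<#1#> Alpha line one.\nAlpha line two. <#2#> Beta."
print(segment_pattern.findall(text))
# [('1', ' Alpha line one.\nAlpha line two. '), ('2', ' Beta.')]
```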
@@ -247,3 +312,59 @@
          extracted_paragraphs.append(" ".join(extracted_segments))
 
      return "\n\n".join(extracted_paragraphs)
+
+
+ def extract_content_from_path(
+     path: bytes | str | List[bytes | str],
+     parsing: ParsingConfig,
+     doc_type: str | DocumentType | None = None,
+ ) -> str | List[str]:
+     """
+     Extract the content from a file path or URL, or a list of file paths or URLs.
+
+     Args:
+         path (bytes | str | List[str]): The file path or URL, or a list of file paths or
+             URLs, or bytes content. The bytes option is meant to support cases
+             where upstream code may have already loaded the content (e.g., from a
+             database or API) and we want to avoid having to copy the content to a
+             temporary file.
+         parsing (ParsingConfig): The parsing configuration.
+         doc_type (str | DocumentType | None): The document type if known.
+             If multiple paths are given, this MUST apply to ALL docs.
+
+     Returns:
+         str | List[str]: The extracted content if a single file path or URL is provided,
+             or a list of extracted contents if a
+             list of file paths or URLs is provided.
+     """
+     if isinstance(path, str) or isinstance(path, bytes):
+         paths = [path]
+     elif isinstance(path, list) and len(path) == 0:
+         return ""
+     else:
+         paths = path
+
+     url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(paths)
+     urls = [paths[i] for i in url_idxs]
+     path_list = [paths[i] for i in path_idxs]
+     byte_list = [paths[i] for i in byte_idxs]
+     path_list.extend(byte_list)
+     parser = Parser(parsing)
+     docs: List[Document] = []
+     try:
+         if len(urls) > 0:
+             loader = URLLoader(urls=urls, parser=parser)  # type: ignore
+             docs = loader.load()
+         if len(path_list) > 0:
+             for p in path_list:
+                 path_docs = RepoLoader.get_documents(
+                     p, parser=parser, doc_type=doc_type
+                 )
+                 docs.extend(path_docs)
+     except Exception as e:
+         logger.warning(f"Error loading path {paths}: {e}")
+         return ""
+     if len(docs) == 1:
+         return docs[0].content
+     else:
+         return [d.content for d in docs]
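Finally, a sketch of the new top-level helper; the file names are illustrative, and the single-input case is hedged because the return type depends on how many Documents result.

```python
# Hypothetical sketch: the helper accepts a URL, a local path, or raw bytes.
from langroid.parsing.parser import ParsingConfig
from langroid.parsing.utils import extract_content_from_path

# A single small text file typically yields one Document, hence a str.
content = extract_content_from_path("notes.txt", ParsingConfig())

# Multiple inputs (or multi-chunk documents) come back as a list of strings.
contents = extract_content_from_path(
    ["https://example.com/report.pdf", "notes.txt"],
    ParsingConfig(),
)
```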