lexoid 0.1.8__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
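For orientation, here is a minimal sketch (not part of the diff itself) of the headline API change visible below: in 0.1.9, `parse()` drops the `raw` flag and returns a dictionary with `raw`, `segments`, `title`, `url`, `parent_title`, `recursive_docs`, and `token_usage` keys. The file path is a placeholder, and token usage is only reported for LLM-based parsing.

```python
# Minimal usage sketch of the 0.1.9 return structure (placeholder file path).
from lexoid.api import parse

result = parse("example.pdf", parser_type="LLM_PARSE")

print(result["raw"])              # full markdown content as one string
for segment in result["segments"]:
    # each segment carries per-page metadata and its content
    print(segment["metadata"]["page"], segment["content"][:80])
print(result["token_usage"])      # {"input": ..., "output": ..., "total": ...}
```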
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: lexoid
- Version: 0.1.8
+ Version: 0.1.9
  Summary:
  Requires-Python: >=3.10,<4.0
  Classifier: Programming Language :: Python :: 3
@@ -28,6 +28,7 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+ Requires-Dist: together (>=1.4.0,<2.0.0)
  Description-Content-Type: text/markdown

  # Lexoid
@@ -35,9 +36,12 @@ Description-Content-Type: text/markdown
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
  [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
  [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/)
+ [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/)

  Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.

+ [Documentation](https://oidlabs-com.github.io/Lexoid/)
+
  ## Motivation:
  - Use the multi-modal advancement of LLMs
  - Enable convenience for users
@@ -90,10 +94,10 @@ Here's a quick example to parse documents using Lexoid:
  from lexoid.api import parse
  from lexoid.api import ParserType

- parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE", raw=True)
+ parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE")["raw"]
  # or
  pdf_path = "path/to/immigration-law-advisor.pdf"
- parsed_md = parse(pdf_path, parser_type="LLM_PARSE", raw=True)
+ parsed_md = parse(pdf_path, parser_type="LLM_PARSE")["raw"]

  print(parsed_md)
  ```
@@ -101,14 +105,15 @@ print(parsed_md)
  ### Parameters
  - path (str): The file path or URL.
  - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
- - raw (bool, optional): Return raw text or structured data. Defaults to False.
  - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
  - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
  - **kwargs: Additional arguments for the parser.

  ## Benchmark
  Initial results (_more updates soon_)
- _Note:_ Benchmarks done in zero-shot scenario currently
+
+ _Note:_ Benchmarks are currently done in the zero-shot setting.
+
  | Rank | Model/Framework | Similarity | Time (s) |
  |------|-----------|------------|----------|
  | 1 | gpt-4o | 0.799 | 21.77|
@@ -3,9 +3,12 @@
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
  [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
  [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/)
+ [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/)

  Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.

+ [Documentation](https://oidlabs-com.github.io/Lexoid/)
+
  ## Motivation:
  - Use the multi-modal advancement of LLMs
  - Enable convenience for users
@@ -58,10 +61,10 @@ Here's a quick example to parse documents using Lexoid:
  from lexoid.api import parse
  from lexoid.api import ParserType

- parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE", raw=True)
+ parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE")["raw"]
  # or
  pdf_path = "path/to/immigration-law-advisor.pdf"
- parsed_md = parse(pdf_path, parser_type="LLM_PARSE", raw=True)
+ parsed_md = parse(pdf_path, parser_type="LLM_PARSE")["raw"]

  print(parsed_md)
  ```
@@ -69,14 +72,15 @@ print(parsed_md)
  ### Parameters
  - path (str): The file path or URL.
  - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
- - raw (bool, optional): Return raw text or structured data. Defaults to False.
  - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
  - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
  - **kwargs: Additional arguments for the parser.

  ## Benchmark
  Initial results (_more updates soon_)
- _Note:_ Benchmarks done in zero-shot scenario currently
+
+ _Note:_ Benchmarks are currently done in the zero-shot setting.
+
  | Rank | Model/Framework | Similarity | Time (s) |
  |------|-----------|------------|----------|
  | 1 | gpt-4o | 0.799 | 21.77|
@@ -28,20 +28,24 @@ class ParserType(Enum):
  AUTO = "AUTO"


- def parse_chunk(
- path: str, parser_type: ParserType, raw: bool, **kwargs
- ) -> List[Dict] | str:
+ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
  """
  Parses a file using the specified parser type.

  Args:
  path (str): The file path or URL.
  parser_type (ParserType): The type of parser to use (LLM_PARSE, STATIC_PARSE, or AUTO).
- raw (bool): Whether to return raw text or structured data.
  **kwargs: Additional arguments for the parser.

  Returns:
- List[Dict] | str: Parsed document data as a list of dictionaries or raw text.
+ Dict: Dictionary containing:
+ - raw: Full markdown content as string
+ - segments: List of dictionaries with metadata and content
+ - title: Title of the document
+ - url: URL if applicable
+ - parent_title: Title of parent doc if recursively parsed
+ - recursive_docs: List of dictionaries for recursively parsed documents
+ - token_usage: Dictionary containing token usage statistics
  """
  if parser_type == ParserType.AUTO:
  parser_type = ParserType[router(path)]
@@ -52,63 +56,80 @@ def parse_chunk(
  )
  if parser_type == ParserType.STATIC_PARSE:
  logger.debug("Using static parser")
- return parse_static_doc(path, raw, **kwargs)
+ return parse_static_doc(path, **kwargs)
  else:
  logger.debug("Using LLM parser")
- return parse_llm_doc(path, raw, **kwargs)
+ return parse_llm_doc(path, **kwargs)


  def parse_chunk_list(
- file_paths: List[str], parser_type: ParserType, raw: bool, kwargs: Dict
- ) -> List[Dict | str]:
+ file_paths: List[str], parser_type: ParserType, kwargs: Dict
+ ) -> Dict:
  """
  Parses a list of files using the specified parser type.

  Args:
  file_paths (list): List of file paths.
  parser_type (ParserType): The type of parser to use.
- raw (bool): Whether to return raw text or structured data.
  kwargs (dict): Additional arguments for the parser.

  Returns:
- List[Dict | str]: List of parsed documents with raw text and/or metadata.
+ Dict: Dictionary containing parsed document data
  """
- local_docs = []
+ combined_segments = []
+ raw_texts = []
+ token_usage = {"input": 0, "output": 0}
  for file_path in file_paths:
- result = parse_chunk(file_path, parser_type, raw, **kwargs)
- if isinstance(result, list):
- local_docs.extend(result)
- else:
- local_docs.append(result.replace("<page break>", "\n\n"))
- return local_docs
+ result = parse_chunk(file_path, parser_type, **kwargs)
+ combined_segments.extend(result["segments"])
+ raw_texts.append(result["raw"])
+ token_usage["input"] += result["token_usage"]["input"]
+ token_usage["output"] += result["token_usage"]["output"]
+ token_usage["total"] = token_usage["input"] + token_usage["output"]
+
+ return {
+ "raw": "\n\n".join(raw_texts),
+ "segments": combined_segments,
+ "title": kwargs.get("title", ""),
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ "token_usage": token_usage,
+ }


  def parse(
  path: str,
  parser_type: Union[str, ParserType] = "LLM_PARSE",
- raw: bool = False,
  pages_per_split: int = 4,
  max_processes: int = 4,
  **kwargs,
- ) -> Union[List[Dict], str]:
+ ) -> Dict:
  """
  Parses a document or URL, optionally splitting it into chunks and using multiprocessing.

  Args:
  path (str): The file path or URL.
- parser_type (Union[str, ParserType], optional): The type of parser to use ("LLM_PARSE", "STATIC_PARSE", or "AUTO"). Defaults to "LLM_PARSE".
- raw (bool, optional): Whether to return raw text or structured data. Defaults to False.
- pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
- max_processes (int, optional): Maximum number of processes for parallel processing. Defaults to 4.
+ parser_type (Union[str, ParserType], optional): Parser type ("LLM_PARSE", "STATIC_PARSE", or "AUTO").
+ pages_per_split (int, optional): Number of pages per split for chunking.
+ max_processes (int, optional): Maximum number of processes for parallel processing.
  **kwargs: Additional arguments for the parser.

  Returns:
- Union[List[Dict], str]: Parsed document data as a list of dictionaries or raw text.
+ Dict: Dictionary containing:
+ - raw: Full markdown content as string
+ - segments: List of dictionaries with metadata and content
+ - title: Title of the document
+ - url: URL if applicable
+ - parent_title: Title of parent doc if recursively parsed
+ - recursive_docs: List of dictionaries for recursively parsed documents
+ - token_usage: Dictionary containing token usage statistics
  """
  kwargs["title"] = os.path.basename(path)
  kwargs["pages_per_split_"] = pages_per_split
  as_pdf = kwargs.get("as_pdf", False)
  depth = kwargs.get("depth", 1)
+
  if type(parser_type) == str:
  parser_type = ParserType[parser_type]

@@ -120,15 +141,19 @@ def parse(
  as_pdf = True

  if path.startswith(("http://", "https://")):
- download_dir = os.path.join(temp_dir, "downloads/")
+ kwargs["url"] = path
+ download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/"))
  os.makedirs(download_dir, exist_ok=True)
  if is_supported_url_file_type(path):
  path = download_file(path, download_dir)
  elif as_pdf:
- pdf_path = os.path.join(download_dir, f"webpage_{int(time())}.pdf")
+ pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
+ if not pdf_filename.endswith(".pdf"):
+ pdf_filename += ".pdf"
+ pdf_path = os.path.join(download_dir, pdf_filename)
  path = convert_to_pdf(path, pdf_path)
  else:
- return recursive_read_html(path, depth, raw)
+ return recursive_read_html(path, depth)

  assert is_supported_file_type(
  path
@@ -140,9 +165,7 @@ def parse(

  if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
  kwargs["split"] = False
- all_docs = parse_chunk(path, parser_type, raw, **kwargs)
- if raw:
- all_docs = [all_docs]
+ result = parse_chunk(path, parser_type, **kwargs)
  else:
  kwargs["split"] = True
  split_dir = os.path.join(temp_dir, "splits/")
@@ -156,22 +179,39 @@ def parse(
  for i in range(0, len(split_files), chunk_size)
  ]

- process_args = [(chunk, parser_type, raw, kwargs) for chunk in file_chunks]
+ process_args = [(chunk, parser_type, kwargs) for chunk in file_chunks]

  if max_processes == 1 or len(file_chunks) == 1:
- all_docs = [parse_chunk_list(*args) for args in process_args]
+ chunk_results = [parse_chunk_list(*args) for args in process_args]
  else:
  with ProcessPoolExecutor(max_workers=max_processes) as executor:
- all_docs = list(executor.map(parse_chunk_list, *zip(*process_args)))
-
- all_docs = [item for sublist in all_docs for item in sublist]
+ chunk_results = list(
+ executor.map(parse_chunk_list, *zip(*process_args))
+ )
+
+ # Combine results from all chunks
+ result = {
+ "raw": "\n\n".join(r["raw"] for r in chunk_results),
+ "segments": [seg for r in chunk_results for seg in r["segments"]],
+ "title": kwargs["title"],
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ "token_usage": {
+ "input": sum(r["token_usage"]["input"] for r in chunk_results),
+ "output": sum(r["token_usage"]["output"] for r in chunk_results),
+ "total": sum(r["token_usage"]["total"] for r in chunk_results),
+ },
+ }
+ if as_pdf:
+ result["pdf_path"] = path

  if depth > 1:
- new_docs = all_docs.copy()
- for doc in all_docs:
+ recursive_docs = []
+ for segment in result["segments"]:
  urls = re.findall(
  r'https?://[^\s<>"\']+|www\.[^\s<>"\']+(?:\.[^\s<>"\']+)*',
- doc if raw else doc["content"],
+ segment["content"],
  )
  for url in urls:
  if "](" in url:
@@ -182,19 +222,16 @@ def parse(

  kwargs_cp = kwargs.copy()
  kwargs_cp["depth"] = depth - 1
- res = parse(
+ kwargs_cp["parent_title"] = result["title"]
+ sub_doc = parse(
  url,
  parser_type=parser_type,
- raw=raw,
  pages_per_split=pages_per_split,
  max_processes=max_processes,
  **kwargs_cp,
  )
+ recursive_docs.append(sub_doc)

- if raw:
- new_docs.append(res)
- else:
- new_docs.extend(res)
- all_docs = new_docs
+ result["recursive_docs"] = recursive_docs

- return "\n".join(all_docs) if raw else all_docs
+ return result
@@ -18,6 +18,7 @@ from lexoid.core.prompt_templates import (
  from lexoid.core.utils import convert_image_to_pdf
  from loguru import logger
  from openai import OpenAI
+ from together import Together
  from huggingface_hub import InferenceClient


@@ -33,38 +34,36 @@ def retry_on_http_error(func):
  return func(*args, **kwargs)
  except HTTPError as e:
  logger.error(f"Retry failed: {e}")
- if kwargs.get("raw", False):
- return ""
- return [
- {
- "metadata": {
- "title": kwargs["title"],
- "page": kwargs.get("start", 0),
- },
- "content": "",
- }
- ]
+ return {
+ "raw": "",
+ "segments": [],
+ "title": kwargs["title"],
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ "error": f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}",
+ }

  return wrapper


  @retry_on_http_error
- def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
  if "model" not in kwargs:
  kwargs["model"] = "gemini-1.5-flash"
  model = kwargs.get("model")
  if model.startswith("gemini"):
- return parse_with_gemini(path, raw, **kwargs)
+ return parse_with_gemini(path, **kwargs)
  if model.startswith("gpt"):
- return parse_with_api(path, raw, api="openai", **kwargs)
+ return parse_with_api(path, api="openai", **kwargs)
  if model.startswith("meta-llama"):
  if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
- return parse_with_together(path, raw, **kwargs)
- return parse_with_api(path, raw, api="huggingface", **kwargs)
+ return parse_with_api(path, api="together", **kwargs)
+ return parse_with_api(path, api="huggingface", **kwargs)
  raise ValueError(f"Unsupported model: {model}")


- def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
  api_key = os.environ.get("GOOGLE_API_KEY")
  if not api_key:
  raise ValueError("GOOGLE_API_KEY environment variable is not set")
@@ -119,25 +118,30 @@ def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
  if "text" in part
  )

- result = ""
+ combined_text = ""
  if "<output>" in raw_text:
- result = raw_text.split("<output>")[1].strip()
+ combined_text = raw_text.split("<output>")[1].strip()
  if "</output>" in result:
- result = result.split("</output>")[0].strip()
-
- if raw:
- return result
-
- return [
- {
- "metadata": {
- "title": kwargs["title"],
- "page": kwargs.get("start", 0) + page_no,
- },
- "content": page,
- }
- for page_no, page in enumerate(result.split("<page-break>"), start=1)
- ]
+ combined_text = result.split("</output>")[0].strip()
+
+ token_usage = result["usageMetadata"]
+
+ return {
+ "raw": combined_text,
+ "segments": [
+ {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
+ for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
+ ],
+ "title": kwargs["title"],
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ "token_usage": {
+ "input": token_usage["promptTokenCount"],
+ "output": token_usage["candidatesTokenCount"],
+ "total": token_usage["totalTokenCount"],
+ },
+ }


  def convert_pdf_page_to_base64(
@@ -155,97 +159,17 @@ def convert_pdf_page_to_base64(
  return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


- def parse_with_together(path: str, raw: bool, **kwargs) -> List[Dict] | str:
- api_key = os.environ.get("TOGETHER_API_KEY")
- if not api_key:
- raise ValueError("TOGETHER_API_KEY environment variable is not set")
-
- url = "https://api.together.xyz/v1/chat/completions"
- headers = {
- "Authorization": f"Bearer {api_key}",
- "Content-Type": "application/json",
- }
-
- mime_type, _ = mimetypes.guess_type(path)
- if mime_type and mime_type.startswith("image"):
- with open(path, "rb") as img_file:
- image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
- images = [(0, f"data:{mime_type};base64,{image_base64}")]
- else:
- pdf_document = pdfium.PdfDocument(path)
- images = [
- (
- page_num,
- f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
- )
- for page_num in range(len(pdf_document))
- ]
-
- all_results = []
- for page_num, image_url in images:
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": LLAMA_PARSER_PROMPT},
- {"type": "image_url", "image_url": {"url": image_url}},
- ],
- }
- ]
-
- payload = {
- "model": kwargs["model"],
- "messages": messages,
- "max_tokens": kwargs.get("max_tokens", 1024),
- "temperature": kwargs.get("temperature", 0.7),
- }
-
- response = requests.post(url, json=payload, headers=headers)
- response.raise_for_status()
- response_data = response.json()
-
- page_text = response_data["choices"][0]["message"]["content"]
- if kwargs.get("verbose", None):
- logger.debug(f"Page {page_num + 1} response: {page_text}")
-
- result = page_text
- if "<output>" in page_text:
- result = page_text.split("<output>")[1].strip()
- if "</output>" in result:
- result = result.split("</output>")[0].strip()
- all_results.append((page_num, result))
-
- all_results.sort(key=lambda x: x[0])
- all_texts = [text for _, text in all_results]
- combined_text = "<page-break>".join(all_texts)
-
- if raw:
- return combined_text
-
- return [
- {
- "metadata": {
- "title": kwargs["title"],
- "page": kwargs.get("start", 0) + page_no,
- },
- "content": page,
- }
- for page_no, page in enumerate(all_texts, start=1)
- ]
-
-
- def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
+ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
  """
  Parse documents (PDFs or images) using various vision model APIs.

  Args:
  path (str): Path to the document to parse
- raw (bool): If True, return raw text; if False, return structured data
- api (str): Which API to use ("openai" or "huggingface")
+ api (str): Which API to use ("openai", "huggingface", or "together")
  **kwargs: Additional arguments including model, temperature, title, etc.

  Returns:
- List[Dict] | str: Parsed content either as raw text or structured data
+ Dict: Dictionary containing parsed document data
  """
  # Initialize appropriate client
  clients = {
@@ -253,6 +177,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
  "huggingface": lambda: InferenceClient(
  token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
  ),
+ "together": lambda: Together(),
  }
  assert api in clients, f"Unsupported API: {api}"
  logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -329,6 +254,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str

  # Get completion from selected API
  response = client.chat.completions.create(**completion_params)
+ token_usage = response.usage

  # Extract the response text
  page_text = response.choices[0].message.content
@@ -341,23 +267,44 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
  result = page_text.split("<output>")[1].strip()
  if "</output>" in result:
  result = result.split("</output>")[0].strip()
- all_results.append((page_num, result))
+ all_results.append(
+ (
+ page_num,
+ result,
+ token_usage.prompt_tokens,
+ token_usage.completion_tokens,
+ token_usage.total_tokens,
+ )
+ )

  # Sort results by page number and combine
  all_results.sort(key=lambda x: x[0])
- all_texts = [text for _, text in all_results]
+ all_texts = [text for _, text, _, _, _ in all_results]
  combined_text = "<page-break>".join(all_texts)

- if raw:
- return combined_text
-
- return [
- {
- "metadata": {
- "title": kwargs["title"],
- "page": kwargs.get("start", 0) + page_no,
- },
- "content": page,
- }
- for page_no, page in enumerate(all_texts, start=1)
- ]
+ return {
+ "raw": combined_text,
+ "segments": [
+ {
+ "metadata": {
+ "page": kwargs.get("start", 0) + page_no + 1,
+ "token_usage": {
+ "input": input_tokens,
+ "output": output_tokens,
+ "total": total_tokens,
+ },
+ },
+ "content": page,
+ }
+ for page_no, page, input_tokens, output_tokens, total_tokens in all_results
+ ],
+ "title": kwargs["title"],
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ "token_usage": {
+ "input": sum(input_tokens for _, _, input_tokens, _, _ in all_results),
+ "output": sum(output_tokens for _, _, _, output_tokens, _ in all_results),
+ "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
+ },
+ }
@@ -9,73 +9,89 @@ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
  from docx import Document


- def parse_static_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+ def parse_static_doc(path: str, **kwargs) -> Dict:
+ """
+ Parses a document using static parsing methods.
+
+ Args:
+ path (str): The file path.
+ **kwargs: Additional arguments for parsing.
+
+ Returns:
+ Dict: Dictionary containing parsed document data
+ """
  framework = kwargs.get("framework", "pdfplumber")

  file_type = get_file_type(path)
  if file_type == "application/pdf":
  if framework == "pdfplumber":
- return parse_with_pdfplumber(path, raw, **kwargs)
+ return parse_with_pdfplumber(path, **kwargs)
  elif framework == "pdfminer":
- return parse_with_pdfminer(path, raw, **kwargs)
+ return parse_with_pdfminer(path, **kwargs)
  else:
  raise ValueError(f"Unsupported framework: {framework}")
  elif "wordprocessing" in file_type:
- return parse_with_docx(path, raw, **kwargs)
+ return parse_with_docx(path, **kwargs)
  elif file_type == "text/html":
  with open(path, "r") as f:
  html_content = f.read()
- return html_to_markdown(html_content, raw, kwargs["title"])
+ return html_to_markdown(html_content, kwargs["title"])
  elif file_type == "text/plain":
  with open(path, "r") as f:
  content = f.read()
- if raw:
- return content
- else:
- return [
- {
- "metadata": {"title": kwargs["title"], "page": 1},
- "content": content,
- }
- ]
+ return {
+ "raw": content,
+ "segments": [{"metadata": {"page": 1}, "content": content}],
+ "title": kwargs["title"],
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ }
  elif file_type == "text/csv":
  df = pd.read_csv(path)
  content = df.to_markdown(index=False)
- if raw:
- return content
- else:
- return [
- {
- "metadata": {"title": kwargs["title"], "page": 1},
- "content": content,
- }
- ]
+ return {
+ "raw": content,
+ "segments": [{"metadata": {"page": 1}, "content": content}],
+ "title": kwargs["title"],
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ }
  else:
  raise ValueError(f"Unsupported file type: {file_type}")


- def parse_with_pdfminer(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+ def parse_with_pdfminer(path: str, **kwargs) -> Dict:
+ """
+ Parse PDF using pdfminer.
+
+ Returns:
+ Dict: Dictionary containing parsed document data
+ """
  pages = list(extract_pages(path))
- docs = []
+ segments = []
+ raw_texts = []
+
  for page_num, page_layout in enumerate(pages, start=1):
  page_text = "".join(
  element.get_text()
  for element in page_layout
  if isinstance(element, LTTextContainer)
  )
- if raw:
- docs.append(page_text)
- else:
- docs.append(
- {
- "metadata": {
- "title": kwargs["title"],
- "page": kwargs["start"] + page_num,
- },
- "content": page_text,
- }
- )
- return "\n".join(docs) if raw else docs
+ raw_texts.append(page_text)
+ segments.append(
+ {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
+ )
+
+ return {
+ "raw": "\n".join(raw_texts),
+ "segments": segments,
+ "title": kwargs["title"],
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ }


  def process_table(table) -> str:
@@ -359,44 +375,44 @@ def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
  return page_texts


- def parse_with_pdfplumber(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+ def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
  """
- Parse PDF and return either raw text or structured data.
-
- Args:
- path (str): Path to the PDF file
- raw (bool): If True, return raw text with page breaks; if False, return structured data
- **kwargs: Additional arguments including 'title' and 'start' page number
+ Parse PDF using pdfplumber.

  Returns:
- Union[List[Dict], str]: Either a list of dictionaries containing page metadata and content,
- or a string of raw text with page breaks
+ Dict: Dictionary containing parsed document data
  """
  page_texts = process_pdf_with_pdfplumber(path)
- if raw:
- return "<page-break>".join(page_texts)
- return [
- {
- "metadata": {"title": kwargs["title"], "page": kwargs["start"] + page_num},
- "content": page_text,
- }
+ segments = [
+ {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
  for page_num, page_text in enumerate(page_texts, start=1)
  ]

+ return {
+ "raw": "<page-break>".join(page_texts),
+ "segments": segments,
+ "title": kwargs["title"],
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ }
+

- def parse_with_docx(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+ def parse_with_docx(path: str, **kwargs) -> Dict:
+ """
+ Parse DOCX document.
+
+ Returns:
+ Dict: Dictionary containing parsed document data
+ """
  doc = Document(path)
  full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])

- if raw:
- return full_text
-
- return [
- {
- "metadata": {
- "title": kwargs["title"],
- "page": kwargs["start"] + 1,
- },
- "content": full_text,
- }
- ]
+ return {
+ "raw": full_text,
+ "segments": [{"metadata": {"page": kwargs["start"] + 1}, "content": full_text}],
+ "title": kwargs["title"],
+ "url": kwargs.get("url", ""),
+ "parent_title": kwargs.get("parent_title", ""),
+ "recursive_docs": [],
+ }
@@ -5,7 +5,8 @@ import os
  import re
  import sys
  from difflib import SequenceMatcher
- from typing import Dict, List, Union
+ from hashlib import md5
+ from typing import Dict, List
  from urllib.parse import urlparse

  import nest_asyncio
@@ -184,14 +185,11 @@ def find_dominant_heading_level(markdown_content: str) -> str:
  return min(heading_counts.keys(), key=len)


- def split_md_by_headings(
- markdown_content: str, heading_pattern: str, title: str
- ) -> List[Dict]:
+ def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Dict]:
  """
  Splits markdown content by the specified heading pattern and structures it.

  Args:
- url (str): The URL of the HTML page
  markdown_content (str): The markdown content to split
  heading_pattern (str): The heading pattern to split on (e.g., '##' or 'underline')

@@ -211,7 +209,7 @@ def split_md_by_headings(
  if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
  structured_content.append(
  {
- "metadata": {"title": title, "page": "Introduction"},
+ "metadata": {"page": "Introduction"},
  "content": sections.pop(0),
  }
  )
@@ -221,7 +219,7 @@ def split_md_by_headings(
  if i + 1 < len(sections):
  structured_content.append(
  {
- "metadata": {"title": title, "page": sections[i]},
+ "metadata": {"page": sections[i]},
  "content": sections[i + 1],
  }
  )
@@ -238,7 +236,7 @@ def split_md_by_headings(
  if len(sections) > len(headings):
  structured_content.append(
  {
- "metadata": {"title": title, "page": "Introduction"},
+ "metadata": {"page": "Introduction"},
  "content": sections.pop(0),
  }
  )
@@ -248,7 +246,7 @@ def split_md_by_headings(
  clean_heading = heading.replace(heading_pattern, "").strip()
  structured_content.append(
  {
- "metadata": {"title": title, "page": clean_heading},
+ "metadata": {"page": clean_heading},
  "content": content,
  }
  )
@@ -256,39 +254,47 @@ def split_md_by_headings(
  return structured_content


- def html_to_markdown(html: str, raw: bool, title: str) -> str:
+ def html_to_markdown(html: str, title: str, url: str) -> str:
  """
  Converts HTML content to markdown.

  Args:
  html (str): The HTML content to convert.
- raw (bool): Whether to return raw markdown text or structured data.
+ title (str): The title of the HTML page
+ url (str): The URL of the HTML page

  Returns:
- Union[str, List[Dict]]: Either raw markdown content or structured data with metadata and content sections.
+ Dict: Dictionary containing parsed document data
  """
  markdown_content = md(html)

- if raw:
- return markdown_content
-
  # Find the dominant heading level
  heading_pattern = find_dominant_heading_level(markdown_content)

  # Split content by headings and structure it
- return split_md_by_headings(markdown_content, heading_pattern, title)
+ split_md = split_md_by_headings(markdown_content, heading_pattern)

+ content = {
+ "raw": markdown_content,
+ "segments": split_md,
+ "title": title,
+ "url": url,
+ "parent_title": "",
+ "recursive_docs": [],
+ }
+
+ return content

- def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
+
+ def read_html_content(url: str) -> Dict:
  """
  Reads the content of an HTML page from the given URL and converts it to markdown or structured content.

  Args:
  url (str): The URL of the HTML page.
- raw (bool): Whether to return raw markdown text or structured data.

  Returns:
- Union[str, List[Dict]]: Either raw markdown content or structured data with metadata and content sections.
+ Dict: Dictionary containing parsed document data
  """

  try:
@@ -298,9 +304,44 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:

  async def fetch_page():
  async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
- page = await browser.new_page()
+ browser = await p.chromium.launch(
+ headless=True,
+ args=[
+ "--disable-blink-features=AutomationControlled",
+ "--no-sandbox",
+ "--window-size=1920,1080",
+ ],
+ )
+ context = await browser.new_context(
+ viewport={"width": 1920, "height": 1080},
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ bypass_csp=True,
+ )
+ page = await context.new_page()
+
+ # Add headers to appear more like a real browser
+ await page.set_extra_http_headers(
+ {
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+ "Accept-Language": "en-US,en;q=0.5",
+ "Sec-Fetch-Dest": "document",
+ "Sec-Fetch-Mode": "navigate",
+ "Sec-Fetch-Site": "none",
+ "Sec-Fetch-User": "?1",
+ }
+ )
+
  await page.goto(url)
+
+ # Wait for Cloudflare check to complete
+ await page.wait_for_load_state("networkidle")
+
+ # Additional wait for any dynamic content
+ try:
+ await page.wait_for_selector("body", timeout=30000)
+ except:
+ pass
+
  html = await page.content()
  await browser.close()
  return html
@@ -316,7 +357,10 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
  soup = BeautifulSoup(
  response.content, "html.parser", from_encoding="iso-8859-1"
  )
- return html_to_markdown(str(soup), raw, title=url)
+ title = soup.title.string.strip() if soup.title else "No title"
+ url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
+ full_title = f"{title} - {url_hash}"
+ return html_to_markdown(str(soup), title=full_title, url=url)


  def extract_urls_from_markdown(content: str) -> List[str]:
@@ -343,61 +387,60 @@ def extract_urls_from_markdown(content: str) -> List[str]:
  return list(set(urls)) # Remove duplicates


- def recursive_read_html(
- url: str, depth: int, raw: bool, visited_urls: set = None
- ) -> Union[str, List[Dict]]:
+ def recursive_read_html(url: str, depth: int, visited_urls: set = None) -> Dict:
  """
  Recursively reads HTML content from URLs up to specified depth.

  Args:
  url (str): The URL to parse
  depth (int): How many levels deep to recursively parse
- raw (bool): Whether to return raw text or structured data
  visited_urls (set): Set of already visited URLs to prevent cycles

  Returns:
- Union[str, List[Dict]]: Combined content from all parsed URLs
+ Dict: Dictionary containing parsed document data
  """
  if visited_urls is None:
  visited_urls = set()

  if url in visited_urls:
- return "" if raw else []
+ return {
+ "raw": "",
+ "segments": [],
+ "title": "",
+ "url": url,
+ "parent_title": "",
+ "recursive_docs": [],
+ }

  visited_urls.add(url)

  try:
- content = read_html_content(url, raw)
+ content = read_html_content(url)
  except Exception as e:
  print(f"Error processing URL {url}: {str(e)}")
- return "" if raw else []
+ return {
+ "raw": "",
+ "segments": [],
+ "title": "",
+ "url": url,
+ "parent_title": "",
+ "recursive_docs": [],
+ }

  if depth <= 1:
  return content

- # Extract URLs from the content
- if raw:
- urls = extract_urls_from_markdown(content)
- else:
- # Extract URLs from all content sections
- urls = []
- for doc in content:
- urls.extend(extract_urls_from_markdown(doc["content"]))
+ # Extract URLs from all content sections
+ urls = extract_urls_from_markdown(content["raw"])

  # Recursively process each URL
+ recursive_docs = []
  for sub_url in urls:
  if sub_url not in visited_urls:
- sub_content = recursive_read_html(sub_url, depth - 1, raw, visited_urls)
-
- if raw:
- if sub_content:
- content += f"\n\n--- Begin content from {sub_url} ---\n\n"
- content += sub_content
- content += f"\n\n--- End content from {sub_url} ---\n\n"
- else:
- if isinstance(sub_content, list):
- content.extend(sub_content)
+ sub_content = recursive_read_html(sub_url, depth - 1, visited_urls)
+ recursive_docs.append(sub_content)

+ content["recursive_docs"] = recursive_docs
  return content


@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "lexoid"
- version = "0.1.8"
+ version = "0.1.9"
  description = ""
  authors = []
  readme = "README.md"
@@ -27,12 +27,18 @@ nest-asyncio ="^1.6.0"
  pyqt5 = {version = "^5.15.11", markers = "platform_system != 'debian'"}
  pyqtwebengine = {version = "^5.15.7", markers = "platform_system != 'debian'"}
  huggingface-hub = "^0.27.0"
+ together = "^1.4.0"

  [tool.poetry.group.dev.dependencies]
  ipykernel = "^6.29.5"
  pytest-asyncio = "^0.23.8"
  pytest = "^8.3.2"

+
+ [tool.poetry.group.docs.dependencies]
+ sphinx = "^8.1.3"
+ pydata-sphinx-theme = "^0.16.1"
+

  [build-system]
  requires = ["poetry-core", "wheel"]
  build-backend = "poetry.core.masonry.api"
File without changes
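The diff also adds URL-handling keyword arguments to `parse()`. The sketch below is illustrative only and uses the keyword names introduced in the hunks above (`as_pdf`, `save_dir`, `save_filename`, `depth`); the URL is the one from the package's own README example, and the chosen values are placeholders.

```python
# Illustrative sketch of the new 0.1.9 URL-parsing keyword arguments.
from lexoid.api import parse

result = parse(
    "https://www.justice.gov/eoir/immigration-law-advisor",
    parser_type="LLM_PARSE",
    as_pdf=True,               # render the webpage to PDF before parsing
    save_dir="downloads",      # new in 0.1.9: keep the rendered PDF here
    save_filename="advisor",   # ".pdf" is appended automatically
    depth=2,                   # follow links one level deep
)

print(result.get("pdf_path"))           # path of the rendered PDF when as_pdf=True
print(len(result["recursive_docs"]))    # sub-documents parsed when depth > 1
```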