lexoid 0.1.8.post1__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lexoid/api.py CHANGED
@@ -19,6 +19,7 @@ from lexoid.core.utils import (
     recursive_read_html,
     router,
     split_pdf,
+    create_sub_pdf,
 )


@@ -28,20 +29,24 @@ class ParserType(Enum):
     AUTO = "AUTO"


-def parse_chunk(
-    path: str, parser_type: ParserType, raw: bool, **kwargs
-) -> List[Dict] | str:
+def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
     """
     Parses a file using the specified parser type.

     Args:
         path (str): The file path or URL.
         parser_type (ParserType): The type of parser to use (LLM_PARSE, STATIC_PARSE, or AUTO).
-        raw (bool): Whether to return raw text or structured data.
         **kwargs: Additional arguments for the parser.

     Returns:
-        List[Dict] | str: Parsed document data as a list of dictionaries or raw text.
+        Dict: Dictionary containing:
+            - raw: Full markdown content as string
+            - segments: List of dictionaries with metadata and content
+            - title: Title of the document
+            - url: URL if applicable
+            - parent_title: Title of parent doc if recursively parsed
+            - recursive_docs: List of dictionaries for recursively parsed documents
+            - token_usage: Dictionary containing token usage statistics
     """
     if parser_type == ParserType.AUTO:
         parser_type = ParserType[router(path)]
@@ -52,63 +57,81 @@ def parse_chunk(
     )
     if parser_type == ParserType.STATIC_PARSE:
         logger.debug("Using static parser")
-        return parse_static_doc(path, raw, **kwargs)
+        return parse_static_doc(path, **kwargs)
     else:
         logger.debug("Using LLM parser")
-        return parse_llm_doc(path, raw, **kwargs)
+        return parse_llm_doc(path, **kwargs)


 def parse_chunk_list(
-    file_paths: List[str], parser_type: ParserType, raw: bool, kwargs: Dict
-) -> List[Dict | str]:
+    file_paths: List[str], parser_type: ParserType, kwargs: Dict
+) -> Dict:
     """
     Parses a list of files using the specified parser type.

     Args:
         file_paths (list): List of file paths.
         parser_type (ParserType): The type of parser to use.
-        raw (bool): Whether to return raw text or structured data.
         kwargs (dict): Additional arguments for the parser.

     Returns:
-        List[Dict | str]: List of parsed documents with raw text and/or metadata.
+        Dict: Dictionary containing parsed document data
     """
-    local_docs = []
+    combined_segments = []
+    raw_texts = []
+    token_usage = {"input": 0, "output": 0}
     for file_path in file_paths:
-        result = parse_chunk(file_path, parser_type, raw, **kwargs)
-        if isinstance(result, list):
-            local_docs.extend(result)
-        else:
-            local_docs.append(result.replace("<page break>", "\n\n"))
-    return local_docs
+        result = parse_chunk(file_path, parser_type, **kwargs)
+        combined_segments.extend(result["segments"])
+        raw_texts.append(result["raw"])
+        if "token_usage" in result:
+            token_usage["input"] += result["token_usage"]["input"]
+            token_usage["output"] += result["token_usage"]["output"]
+    token_usage["total"] = token_usage["input"] + token_usage["output"]
+
+    return {
+        "raw": "\n\n".join(raw_texts),
+        "segments": combined_segments,
+        "title": kwargs.get("title", ""),
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": token_usage,
+    }


 def parse(
     path: str,
     parser_type: Union[str, ParserType] = "LLM_PARSE",
-    raw: bool = False,
     pages_per_split: int = 4,
     max_processes: int = 4,
     **kwargs,
-) -> Union[List[Dict], str]:
+) -> Dict:
     """
     Parses a document or URL, optionally splitting it into chunks and using multiprocessing.

     Args:
         path (str): The file path or URL.
-        parser_type (Union[str, ParserType], optional): The type of parser to use ("LLM_PARSE", "STATIC_PARSE", or "AUTO"). Defaults to "LLM_PARSE".
-        raw (bool, optional): Whether to return raw text or structured data. Defaults to False.
-        pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
-        max_processes (int, optional): Maximum number of processes for parallel processing. Defaults to 4.
+        parser_type (Union[str, ParserType], optional): Parser type ("LLM_PARSE", "STATIC_PARSE", or "AUTO").
+        pages_per_split (int, optional): Number of pages per split for chunking.
+        max_processes (int, optional): Maximum number of processes for parallel processing.
         **kwargs: Additional arguments for the parser.

     Returns:
-        Union[List[Dict], str]: Parsed document data as a list of dictionaries or raw text.
+        Dict: Dictionary containing:
+            - raw: Full markdown content as string
+            - segments: List of dictionaries with metadata and content
+            - title: Title of the document
+            - url: URL if applicable
+            - parent_title: Title of parent doc if recursively parsed
+            - recursive_docs: List of dictionaries for recursively parsed documents
+            - token_usage: Dictionary containing token usage statistics
     """
     kwargs["title"] = os.path.basename(path)
     kwargs["pages_per_split_"] = pages_per_split
     as_pdf = kwargs.get("as_pdf", False)
     depth = kwargs.get("depth", 1)
+
     if type(parser_type) == str:
         parser_type = ParserType[parser_type]

@@ -120,15 +143,19 @@ def parse(
             as_pdf = True

         if path.startswith(("http://", "https://")):
-            download_dir = os.path.join(temp_dir, "downloads/")
+            kwargs["url"] = path
+            download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/"))
             os.makedirs(download_dir, exist_ok=True)
             if is_supported_url_file_type(path):
                 path = download_file(path, download_dir)
             elif as_pdf:
-                pdf_path = os.path.join(download_dir, f"webpage_{int(time())}.pdf")
+                pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
+                if not pdf_filename.endswith(".pdf"):
+                    pdf_filename += ".pdf"
+                pdf_path = os.path.join(download_dir, pdf_filename)
                 path = convert_to_pdf(path, pdf_path)
             else:
-                return recursive_read_html(path, depth, raw)
+                return recursive_read_html(path, depth)

         assert is_supported_file_type(
             path
@@ -138,11 +165,15 @@ def parse(
             pdf_path = os.path.join(temp_dir, "converted.pdf")
             path = convert_to_pdf(path, pdf_path)

+        if "page_nums" in kwargs and path.lower().endswith(".pdf"):
+            sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
+            os.makedirs(sub_pdf_dir, exist_ok=True)
+            sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
+            path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
+
         if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
             kwargs["split"] = False
-            all_docs = parse_chunk(path, parser_type, raw, **kwargs)
-            if raw:
-                all_docs = [all_docs]
+            result = parse_chunk(path, parser_type, **kwargs)
         else:
             kwargs["split"] = True
             split_dir = os.path.join(temp_dir, "splits/")
@@ -156,22 +187,39 @@ def parse(
                 for i in range(0, len(split_files), chunk_size)
             ]

-            process_args = [(chunk, parser_type, raw, kwargs) for chunk in file_chunks]
+            process_args = [(chunk, parser_type, kwargs) for chunk in file_chunks]

             if max_processes == 1 or len(file_chunks) == 1:
-                all_docs = [parse_chunk_list(*args) for args in process_args]
+                chunk_results = [parse_chunk_list(*args) for args in process_args]
             else:
                 with ProcessPoolExecutor(max_workers=max_processes) as executor:
-                    all_docs = list(executor.map(parse_chunk_list, *zip(*process_args)))
-
-            all_docs = [item for sublist in all_docs for item in sublist]
+                    chunk_results = list(
+                        executor.map(parse_chunk_list, *zip(*process_args))
+                    )
+
+            # Combine results from all chunks
+            result = {
+                "raw": "\n\n".join(r["raw"] for r in chunk_results),
+                "segments": [seg for r in chunk_results for seg in r["segments"]],
+                "title": kwargs["title"],
+                "url": kwargs.get("url", ""),
+                "parent_title": kwargs.get("parent_title", ""),
+                "recursive_docs": [],
+                "token_usage": {
+                    "input": sum(r["token_usage"]["input"] for r in chunk_results),
+                    "output": sum(r["token_usage"]["output"] for r in chunk_results),
+                    "total": sum(r["token_usage"]["total"] for r in chunk_results),
+                },
+            }
+        if as_pdf:
+            result["pdf_path"] = path

         if depth > 1:
-            new_docs = all_docs.copy()
-            for doc in all_docs:
+            recursive_docs = []
+            for segment in result["segments"]:
                 urls = re.findall(
                     r'https?://[^\s<>"\']+|www\.[^\s<>"\']+(?:\.[^\s<>"\']+)*',
-                    doc if raw else doc["content"],
+                    segment["content"],
                 )
                 for url in urls:
                     if "](" in url:
@@ -182,19 +230,16 @@

                     kwargs_cp = kwargs.copy()
                     kwargs_cp["depth"] = depth - 1
-                    res = parse(
+                    kwargs_cp["parent_title"] = result["title"]
+                    sub_doc = parse(
                         url,
                         parser_type=parser_type,
-                        raw=raw,
                         pages_per_split=pages_per_split,
                         max_processes=max_processes,
                         **kwargs_cp,
                     )
+                    recursive_docs.append(sub_doc)

-                    if raw:
-                        new_docs.append(res)
-                    else:
-                        new_docs.extend(res)
-            all_docs = new_docs
+        result["recursive_docs"] = recursive_docs

-    return "\n".join(all_docs) if raw else all_docs
+    return result
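The net effect of the `api.py` changes above is that `parse` now always returns a single dictionary rather than a list of dicts or a raw string. A minimal usage sketch of the new shape (the file name is hypothetical, and a configured API key for the default Gemini model is assumed):

```python
from lexoid.api import parse

# Hypothetical input; any supported file path or URL works the same way.
result = parse("sample.pdf", parser_type="LLM_PARSE", pages_per_split=4, max_processes=4)

print(result["title"])              # basename of the input path
print(result["raw"][:200])          # full markdown output
for segment in result["segments"]:  # per-page chunks with metadata
    print(segment["metadata"]["page"], len(segment["content"]))
print(result["token_usage"])        # {"input": ..., "output": ..., "total": ...}
print(result["recursive_docs"])     # populated only when depth > 1
```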
lexoid/core/parse_type/llm_parser.py CHANGED
@@ -18,6 +18,7 @@ from lexoid.core.prompt_templates import (
 from lexoid.core.utils import convert_image_to_pdf
 from loguru import logger
 from openai import OpenAI
+from together import Together
 from huggingface_hub import InferenceClient


@@ -33,38 +34,36 @@ def retry_on_http_error(func):
             return func(*args, **kwargs)
         except HTTPError as e:
             logger.error(f"Retry failed: {e}")
-            if kwargs.get("raw", False):
-                return ""
-            return [
-                {
-                    "metadata": {
-                        "title": kwargs["title"],
-                        "page": kwargs.get("start", 0),
-                    },
-                    "content": "",
-                }
-            ]
+            return {
+                "raw": "",
+                "segments": [],
+                "title": kwargs["title"],
+                "url": kwargs.get("url", ""),
+                "parent_title": kwargs.get("parent_title", ""),
+                "recursive_docs": [],
+                "error": f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}",
+            }

     return wrapper


 @retry_on_http_error
-def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
         kwargs["model"] = "gemini-1.5-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
-        return parse_with_gemini(path, raw, **kwargs)
+        return parse_with_gemini(path, **kwargs)
     if model.startswith("gpt"):
-        return parse_with_api(path, raw, api="openai", **kwargs)
+        return parse_with_api(path, api="openai", **kwargs)
     if model.startswith("meta-llama"):
         if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
-            return parse_with_together(path, raw, **kwargs)
-        return parse_with_api(path, raw, api="huggingface", **kwargs)
+            return parse_with_api(path, api="together", **kwargs)
+        return parse_with_api(path, api="huggingface", **kwargs)
     raise ValueError(f"Unsupported model: {model}")


-def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     api_key = os.environ.get("GOOGLE_API_KEY")
     if not api_key:
         raise ValueError("GOOGLE_API_KEY environment variable is not set")
@@ -119,25 +118,33 @@ def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
         if "text" in part
     )

-    result = ""
+    combined_text = ""
     if "<output>" in raw_text:
-        result = raw_text.split("<output>")[1].strip()
+        combined_text = raw_text.split("<output>")[1].strip()
         if "</output>" in result:
-            result = result.split("</output>")[0].strip()
-
-    if raw:
-        return result
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs.get("start", 0) + page_no,
-            },
-            "content": page,
-        }
-        for page_no, page in enumerate(result.split("<page-break>"), start=1)
-    ]
+            combined_text = result.split("</output>")[0].strip()
+
+    token_usage = result["usageMetadata"]
+    input_tokens = token_usage.get("promptTokenCount", 0)
+    output_tokens = token_usage.get("candidatesTokenCount", 0)
+    total_tokens = input_tokens + output_tokens
+
+    return {
+        "raw": combined_text,
+        "segments": [
+            {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
+            for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
+        ],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": {
+            "input": input_tokens,
+            "output": output_tokens,
+            "total": total_tokens,
+        },
+    }


 def convert_pdf_page_to_base64(
@@ -155,97 +162,17 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


-def parse_with_together(path: str, raw: bool, **kwargs) -> List[Dict] | str:
-    api_key = os.environ.get("TOGETHER_API_KEY")
-    if not api_key:
-        raise ValueError("TOGETHER_API_KEY environment variable is not set")
-
-    url = "https://api.together.xyz/v1/chat/completions"
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
-
-    mime_type, _ = mimetypes.guess_type(path)
-    if mime_type and mime_type.startswith("image"):
-        with open(path, "rb") as img_file:
-            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
-            images = [(0, f"data:{mime_type};base64,{image_base64}")]
-    else:
-        pdf_document = pdfium.PdfDocument(path)
-        images = [
-            (
-                page_num,
-                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
-            )
-            for page_num in range(len(pdf_document))
-        ]
-
-    all_results = []
-    for page_num, image_url in images:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": LLAMA_PARSER_PROMPT},
-                    {"type": "image_url", "image_url": {"url": image_url}},
-                ],
-            }
-        ]
-
-        payload = {
-            "model": kwargs["model"],
-            "messages": messages,
-            "max_tokens": kwargs.get("max_tokens", 1024),
-            "temperature": kwargs.get("temperature", 0.7),
-        }
-
-        response = requests.post(url, json=payload, headers=headers)
-        response.raise_for_status()
-        response_data = response.json()
-
-        page_text = response_data["choices"][0]["message"]["content"]
-        if kwargs.get("verbose", None):
-            logger.debug(f"Page {page_num + 1} response: {page_text}")
-
-        result = page_text
-        if "<output>" in page_text:
-            result = page_text.split("<output>")[1].strip()
-            if "</output>" in result:
-                result = result.split("</output>")[0].strip()
-        all_results.append((page_num, result))
-
-    all_results.sort(key=lambda x: x[0])
-    all_texts = [text for _, text in all_results]
-    combined_text = "<page-break>".join(all_texts)
-
-    if raw:
-        return combined_text
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs.get("start", 0) + page_no,
-            },
-            "content": page,
-        }
-        for page_no, page in enumerate(all_texts, start=1)
-    ]
-
-
-def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
+def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
     """
     Parse documents (PDFs or images) using various vision model APIs.

     Args:
         path (str): Path to the document to parse
-        raw (bool): If True, return raw text; if False, return structured data
-        api (str): Which API to use ("openai" or "huggingface")
+        api (str): Which API to use ("openai", "huggingface", or "together")
         **kwargs: Additional arguments including model, temperature, title, etc.

     Returns:
-        List[Dict] | str: Parsed content either as raw text or structured data
+        Dict: Dictionary containing parsed document data
     """
     # Initialize appropriate client
     clients = {
@@ -253,6 +180,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
         "huggingface": lambda: InferenceClient(
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
+        "together": lambda: Together(),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -329,6 +257,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str

         # Get completion from selected API
         response = client.chat.completions.create(**completion_params)
+        token_usage = response.usage

         # Extract the response text
         page_text = response.choices[0].message.content
@@ -341,23 +270,44 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
             result = page_text.split("<output>")[1].strip()
             if "</output>" in result:
                 result = result.split("</output>")[0].strip()
-        all_results.append((page_num, result))
+        all_results.append(
+            (
+                page_num,
+                result,
+                token_usage.prompt_tokens,
+                token_usage.completion_tokens,
+                token_usage.total_tokens,
+            )
+        )

     # Sort results by page number and combine
     all_results.sort(key=lambda x: x[0])
-    all_texts = [text for _, text in all_results]
+    all_texts = [text for _, text, _, _, _ in all_results]
     combined_text = "<page-break>".join(all_texts)

-    if raw:
-        return combined_text
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs.get("start", 0) + page_no,
-            },
-            "content": page,
-        }
-        for page_no, page in enumerate(all_texts, start=1)
-    ]
+    return {
+        "raw": combined_text,
+        "segments": [
+            {
+                "metadata": {
+                    "page": kwargs.get("start", 0) + page_no + 1,
+                    "token_usage": {
+                        "input": input_tokens,
+                        "output": output_tokens,
+                        "total": total_tokens,
+                    },
+                },
+                "content": page,
+            }
+            for page_no, page, input_tokens, output_tokens, total_tokens in all_results
+        ],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+        "token_usage": {
+            "input": sum(input_tokens for _, _, input_tokens, _, _ in all_results),
+            "output": sum(output_tokens for _, _, _, output_tokens, _ in all_results),
+            "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
+        },
+    }
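As the hunks above show, `parse_with_api` now threads `response.usage` through each page, so token counts surface both in every segment's metadata and as a document-level sum. A small sketch of reading them, assuming a hypothetical `scan.png`, the Together-served Llama model named in the diff, and the matching API key in the environment:

```python
from lexoid.api import parse

# Image input routes through parse_with_api(api="together") for this model.
result = parse("scan.png", parser_type="LLM_PARSE", model="meta-llama/Llama-Vision-Free")

for segment in result["segments"]:
    usage = segment["metadata"]["token_usage"]
    print(segment["metadata"]["page"], usage["input"], usage["output"], usage["total"])

# The top-level counts are the sums over all pages.
print(result["token_usage"])
```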
lexoid/core/parse_type/static_parser.py CHANGED
@@ -9,73 +9,89 @@ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
 from docx import Document


-def parse_static_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_static_doc(path: str, **kwargs) -> Dict:
+    """
+    Parses a document using static parsing methods.
+
+    Args:
+        path (str): The file path.
+        **kwargs: Additional arguments for parsing.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     framework = kwargs.get("framework", "pdfplumber")

     file_type = get_file_type(path)
     if file_type == "application/pdf":
         if framework == "pdfplumber":
-            return parse_with_pdfplumber(path, raw, **kwargs)
+            return parse_with_pdfplumber(path, **kwargs)
         elif framework == "pdfminer":
-            return parse_with_pdfminer(path, raw, **kwargs)
+            return parse_with_pdfminer(path, **kwargs)
         else:
             raise ValueError(f"Unsupported framework: {framework}")
     elif "wordprocessing" in file_type:
-        return parse_with_docx(path, raw, **kwargs)
+        return parse_with_docx(path, **kwargs)
     elif file_type == "text/html":
         with open(path, "r") as f:
             html_content = f.read()
-        return html_to_markdown(html_content, raw, kwargs["title"])
+        return html_to_markdown(html_content, kwargs["title"])
     elif file_type == "text/plain":
         with open(path, "r") as f:
             content = f.read()
-        if raw:
-            return content
-        else:
-            return [
-                {
-                    "metadata": {"title": kwargs["title"], "page": 1},
-                    "content": content,
-                }
-            ]
+        return {
+            "raw": content,
+            "segments": [{"metadata": {"page": 1}, "content": content}],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     elif file_type == "text/csv":
         df = pd.read_csv(path)
         content = df.to_markdown(index=False)
-        if raw:
-            return content
-        else:
-            return [
-                {
-                    "metadata": {"title": kwargs["title"], "page": 1},
-                    "content": content,
-                }
-            ]
+        return {
+            "raw": content,
+            "segments": [{"metadata": {"page": 1}, "content": content}],
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     else:
         raise ValueError(f"Unsupported file type: {file_type}")


-def parse_with_pdfminer(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_with_pdfminer(path: str, **kwargs) -> Dict:
+    """
+    Parse PDF using pdfminer.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     pages = list(extract_pages(path))
-    docs = []
+    segments = []
+    raw_texts = []
+
     for page_num, page_layout in enumerate(pages, start=1):
         page_text = "".join(
             element.get_text()
             for element in page_layout
             if isinstance(element, LTTextContainer)
         )
-        if raw:
-            docs.append(page_text)
-        else:
-            docs.append(
-                {
-                    "metadata": {
-                        "title": kwargs["title"],
-                        "page": kwargs["start"] + page_num,
-                    },
-                    "content": page_text,
-                }
-            )
-    return "\n".join(docs) if raw else docs
+        raw_texts.append(page_text)
+        segments.append(
+            {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
+        )
+
+    return {
+        "raw": "\n".join(raw_texts),
+        "segments": segments,
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }


 def process_table(table) -> str:
@@ -359,44 +375,44 @@ def process_pdf_with_pdfplumber(path: str, **kwargs) -> List[str]:
     return page_texts


-def parse_with_pdfplumber(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
     """
-    Parse PDF and return either raw text or structured data.
-
-    Args:
-        path (str): Path to the PDF file
-        raw (bool): If True, return raw text with page breaks; if False, return structured data
-        **kwargs: Additional arguments including 'title' and 'start' page number
+    Parse PDF using pdfplumber.

     Returns:
-        Union[List[Dict], str]: Either a list of dictionaries containing page metadata and content,
-            or a string of raw text with page breaks
+        Dict: Dictionary containing parsed document data
     """
     page_texts = process_pdf_with_pdfplumber(path)
-    if raw:
-        return "<page-break>".join(page_texts)
-    return [
-        {
-            "metadata": {"title": kwargs["title"], "page": kwargs["start"] + page_num},
-            "content": page_text,
-        }
+    segments = [
+        {"metadata": {"page": kwargs["start"] + page_num}, "content": page_text}
         for page_num, page_text in enumerate(page_texts, start=1)
     ]

+    return {
+        "raw": "<page-break>".join(page_texts),
+        "segments": segments,
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }
+

-def parse_with_docx(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+def parse_with_docx(path: str, **kwargs) -> Dict:
+    """
+    Parse DOCX document.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
     doc = Document(path)
     full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])

-    if raw:
-        return full_text
-
-    return [
-        {
-            "metadata": {
-                "title": kwargs["title"],
-                "page": kwargs["start"] + 1,
-            },
-            "content": full_text,
-        }
-    ]
+    return {
+        "raw": full_text,
+        "segments": [{"metadata": {"page": kwargs["start"] + 1}, "content": full_text}],
+        "title": kwargs["title"],
+        "url": kwargs.get("url", ""),
+        "parent_title": kwargs.get("parent_title", ""),
+        "recursive_docs": [],
+    }
lexoid/core/utils.py CHANGED
@@ -5,7 +5,8 @@ import os
 import re
 import sys
 from difflib import SequenceMatcher
-from typing import Dict, List, Union
+from hashlib import md5
+from typing import Dict, List, Optional
 from urllib.parse import urlparse

 import nest_asyncio
@@ -44,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
     return paths


+def create_sub_pdf(
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...]|int] = None
+) -> str:
+    if isinstance(page_nums, int):
+        page_nums = (page_nums,)
+    page_nums = tuple(sorted(set(page_nums)))
+    with pikepdf.open(input_path) as pdf:
+        indices = page_nums if page_nums else range(len(pdf.pages))
+        with pikepdf.new() as new_pdf:
+            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
+            new_pdf.save(output_path)
+    return output_path
+
+
 def convert_image_to_pdf(image_path: str) -> bytes:
     with Image.open(image_path) as img:
         img_rgb = img.convert("RGB")
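The new `create_sub_pdf` helper backs the `page_nums` argument that `parse` now accepts; page numbers are 1-based, as the `pdf.pages[i - 1]` indexing shows. A short sketch, with `report.pdf` as a hypothetical input:

```python
from lexoid.api import parse
from lexoid.core.utils import create_sub_pdf

# Extract pages 1 and 3 into a new PDF (1-based page numbers).
create_sub_pdf("report.pdf", "report_subset.pdf", page_nums=(1, 3))

# Or let parse() do it internally: the page_nums kwarg routes through
# create_sub_pdf before the document is split and parsed.
result = parse("report.pdf", page_nums=(1, 3))
print(len(result["segments"]))
```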
@@ -184,14 +199,11 @@ def find_dominant_heading_level(markdown_content: str) -> str:
     return min(heading_counts.keys(), key=len)


-def split_md_by_headings(
-    markdown_content: str, heading_pattern: str, title: str
-) -> List[Dict]:
+def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Dict]:
     """
     Splits markdown content by the specified heading pattern and structures it.

     Args:
-        url (str): The URL of the HTML page
         markdown_content (str): The markdown content to split
         heading_pattern (str): The heading pattern to split on (e.g., '##' or 'underline')

@@ -211,7 +223,7 @@ def split_md_by_headings(
     if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
         structured_content.append(
             {
-                "metadata": {"title": title, "page": "Introduction"},
+                "metadata": {"page": "Introduction"},
                 "content": sections.pop(0),
             }
         )
@@ -221,7 +233,7 @@
         if i + 1 < len(sections):
             structured_content.append(
                 {
-                    "metadata": {"title": title, "page": sections[i]},
+                    "metadata": {"page": sections[i]},
                     "content": sections[i + 1],
                 }
             )
@@ -238,7 +250,7 @@
     if len(sections) > len(headings):
         structured_content.append(
             {
-                "metadata": {"title": title, "page": "Introduction"},
+                "metadata": {"page": "Introduction"},
                 "content": sections.pop(0),
             }
         )
@@ -248,7 +260,7 @@
         clean_heading = heading.replace(heading_pattern, "").strip()
         structured_content.append(
             {
-                "metadata": {"title": title, "page": clean_heading},
+                "metadata": {"page": clean_heading},
                 "content": content,
             }
         )
@@ -256,39 +268,47 @@
     return structured_content


-def html_to_markdown(html: str, raw: bool, title: str) -> str:
+def html_to_markdown(html: str, title: str, url: str) -> str:
     """
     Converts HTML content to markdown.

     Args:
         html (str): The HTML content to convert.
-        raw (bool): Whether to return raw markdown text or structured data.
+        title (str): The title of the HTML page
+        url (str): The URL of the HTML page

     Returns:
-        Union[str, List[Dict]]: Either raw markdown content or structured data with metadata and content sections.
+        Dict: Dictionary containing parsed document data
     """
     markdown_content = md(html)

-    if raw:
-        return markdown_content
-
     # Find the dominant heading level
     heading_pattern = find_dominant_heading_level(markdown_content)

     # Split content by headings and structure it
-    return split_md_by_headings(markdown_content, heading_pattern, title)
+    split_md = split_md_by_headings(markdown_content, heading_pattern)
+
+    content = {
+        "raw": markdown_content,
+        "segments": split_md,
+        "title": title,
+        "url": url,
+        "parent_title": "",
+        "recursive_docs": [],
+    }
+
+    return content


-def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
+def read_html_content(url: str) -> Dict:
     """
     Reads the content of an HTML page from the given URL and converts it to markdown or structured content.

     Args:
         url (str): The URL of the HTML page.
-        raw (bool): Whether to return raw markdown text or structured data.

     Returns:
-        Union[str, List[Dict]]: Either raw markdown content or structured data with metadata and content sections.
+        Dict: Dictionary containing parsed document data
     """

     try:
@@ -351,7 +371,10 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
         soup = BeautifulSoup(
             response.content, "html.parser", from_encoding="iso-8859-1"
         )
-    return html_to_markdown(str(soup), raw, title=url)
+    title = soup.title.string.strip() if soup.title else "No title"
+    url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
+    full_title = f"{title} - {url_hash}"
+    return html_to_markdown(str(soup), title=full_title, url=url)


 def extract_urls_from_markdown(content: str) -> List[str]:
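`read_html_content` now derives the title from the page's `<title>` plus the first eight hex characters of the URL's MD5, so identical titles fetched from different URLs stay distinguishable. For illustration (URL and page title are placeholders):

```python
from hashlib import md5

url = "https://example.com/docs"
url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
full_title = f"Example Docs - {url_hash}"  # "<page title> - <8-char hash>"
print(full_title)
```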
@@ -378,61 +401,60 @@ def extract_urls_from_markdown(content: str) -> List[str]:
     return list(set(urls))  # Remove duplicates


-def recursive_read_html(
-    url: str, depth: int, raw: bool, visited_urls: set = None
-) -> Union[str, List[Dict]]:
+def recursive_read_html(url: str, depth: int, visited_urls: set = None) -> Dict:
     """
     Recursively reads HTML content from URLs up to specified depth.

     Args:
         url (str): The URL to parse
         depth (int): How many levels deep to recursively parse
-        raw (bool): Whether to return raw text or structured data
         visited_urls (set): Set of already visited URLs to prevent cycles

     Returns:
-        Union[str, List[Dict]]: Combined content from all parsed URLs
+        Dict: Dictionary containing parsed document data
     """
     if visited_urls is None:
         visited_urls = set()

     if url in visited_urls:
-        return "" if raw else []
+        return {
+            "raw": "",
+            "segments": [],
+            "title": "",
+            "url": url,
+            "parent_title": "",
+            "recursive_docs": [],
+        }

     visited_urls.add(url)

     try:
-        content = read_html_content(url, raw)
+        content = read_html_content(url)
     except Exception as e:
         print(f"Error processing URL {url}: {str(e)}")
-        return "" if raw else []
+        return {
+            "raw": "",
+            "segments": [],
+            "title": "",
+            "url": url,
+            "parent_title": "",
+            "recursive_docs": [],
+        }

     if depth <= 1:
         return content

-    # Extract URLs from the content
-    if raw:
-        urls = extract_urls_from_markdown(content)
-    else:
-        # Extract URLs from all content sections
-        urls = []
-        for doc in content:
-            urls.extend(extract_urls_from_markdown(doc["content"]))
+    # Extract URLs from all content sections
+    urls = extract_urls_from_markdown(content["raw"])

     # Recursively process each URL
+    recursive_docs = []
     for sub_url in urls:
         if sub_url not in visited_urls:
-            sub_content = recursive_read_html(sub_url, depth - 1, raw, visited_urls)
-
-            if raw:
-                if sub_content:
-                    content += f"\n\n--- Begin content from {sub_url} ---\n\n"
-                    content += sub_content
-                    content += f"\n\n--- End content from {sub_url} ---\n\n"
-            else:
-                if isinstance(sub_content, list):
-                    content.extend(sub_content)
+            sub_content = recursive_read_html(sub_url, depth - 1, visited_urls)
+            recursive_docs.append(sub_content)

+    content["recursive_docs"] = recursive_docs
     return content
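With the `raw` flag gone, recursive crawling no longer splices sub-page text into one string or list; each child URL yields a complete result dictionary under `recursive_docs`. A sketch of walking that tree, reusing the URL from the README example below and assuming network access:

```python
from lexoid.api import parse

result = parse("https://www.justice.gov/eoir/immigration-law-advisor", depth=2)

def walk(doc, level=0):
    # Each node is a full parse result; children sit in "recursive_docs".
    print("  " * level + (doc["title"] or doc["url"]))
    for child in doc["recursive_docs"]:
        walk(child, level + 1)

walk(result)
```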
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.8.post1
+Version: 0.1.10
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -28,6 +28,7 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: together (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown

 # Lexoid
@@ -93,10 +94,10 @@ Here's a quick example to parse documents using Lexoid:
 from lexoid.api import parse
 from lexoid.api import ParserType

-parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE", raw=True)
+parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type="LLM_PARSE")["raw"]
 # or
 pdf_path = "path/to/immigration-law-advisor.pdf"
-parsed_md = parse(pdf_path, parser_type="LLM_PARSE", raw=True)
+parsed_md = parse(pdf_path, parser_type="LLM_PARSE")["raw"]

 print(parsed_md)
 ```
@@ -104,7 +105,6 @@ print(parsed_md)
 ### Parameters
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
-- raw (bool, optional): Return raw text or structured data. Defaults to False.
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
 - **kwargs: Additional arguments for the parser.
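With `raw` removed from the parameter list, the markdown string and the structured segments come from the same call. A short sketch using the hypothetical path from the example above:

```python
from lexoid.api import parse

result = parse("path/to/immigration-law-advisor.pdf", parser_type="LLM_PARSE")

markdown = result["raw"]          # what raw=True used to return
for segment in result["segments"]:
    page = segment["metadata"]["page"]
    print(f"--- page {page} ---")
    print(segment["content"][:80])
```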
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=45nkTuQcxdppeUiRsiyioJtvlVeWeoq_WgKtGCthIBY,9193
+lexoid/core/parse_type/llm_parser.py,sha256=tH19B0w78OowkDdqJg3rom0kQmyuTaTfDP98Qnwufo0,10625
+lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=HT37qmdhPpUNN6O571G7ItE5K2Mv8SreBHmxrhdiXA8,18951
+lexoid-0.1.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.10.dist-info/METADATA,sha256=4uhJ_IaHEKPl9lxKg8RRrBQ5dn7oB23XCnJNG5sNpH4,4576
+lexoid-0.1.10.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.10.dist-info/RECORD,,
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
-lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
-lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=peWuMVTk90-j0aSDaRnwigpoAz_Q5y8vSosCDc6Zl3g,18642
-lexoid-0.1.8.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.8.post1.dist-info/METADATA,sha256=mz8A_92-GrLfOmT8UYcIxWIEkcskad_9vSnNnlbE4dI,4625
-lexoid-0.1.8.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.8.post1.dist-info/RECORD,,