lexoid 0.1.6__tar.gz → 0.1.7__tar.gz

This diff shows the changes between publicly available package versions as published to a supported public registry. It is provided for informational purposes only and reflects the package contents as they appear in that registry.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: lexoid
- Version: 0.1.6
+ Version: 0.1.7
  Summary:
  Requires-Python: >=3.10,<4.0
  Classifier: Programming Language :: Python :: 3
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: bs4 (>=0.0.2,<0.0.3)
  Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
  Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
+ Requires-Dist: huggingface-hub (>=0.27.0,<0.28.0)
  Requires-Dist: loguru (>=0.7.2,<0.8.0)
  Requires-Dist: markdown (>=3.7,<4.0)
  Requires-Dist: markdownify (>=0.13.1,<0.14.0)
@@ -22,9 +23,12 @@ Requires-Dist: pdfplumber (>=0.11.4,<0.12.0)
  Requires-Dist: pikepdf (>=9.3.0,<10.0.0)
  Requires-Dist: playwright (>=1.49.0,<2.0.0)
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
+ Requires-Dist: pyqt5 (>=5.15.11,<6.0.0) ; platform_system != "debian"
+ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+ Requires-Dist: together (>=1.3.10,<2.0.0)
  Description-Content-Type: text/markdown

  # Lexoid
@@ -37,38 +41,46 @@ Lexoid is an efficient document parsing library that supports both LLM-based and
  - Collaborate with a permissive license

  ## Installation
- To install dependencies:
+ ### Installing with pip
  ```
- make install
+ pip install lexoid
  ```
- or, to install with dev-dependencies:
+
+ To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
  ```
- make dev
+ OPENAI_API_KEY=""
+ GOOGLE_API_KEY=""
  ```

- To activate virtual environment:
+ Optionally, to use `Playwright` for retrieving web content with the `.whl` package (else regular requests will be used by default):
  ```
- source .venv/bin/activate
+ playwright install --with-deps --only-shell chromium
  ```

- To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
+ ### Building `.whl` from source
+ To create `.whl`:
  ```
- OPENAI_API_KEY=""
- GOOGLE_API_KEY=""
+ make build
  ```

- To build a `.whl` file for testing:
+ ### Creating a local installation
+ To install dependencies:
+ ```
+ make install
+ ```
+ or, to install with dev-dependencies:
  ```
- poetry build
+ make dev
  ```

- Optionally, to use `Playwright` for retrieving web content with the `.whl` package (else regular requests will be used by default):
+ To activate virtual environment:
  ```
- playwright install --with-deps --only-shell chromium
+ source .venv/bin/activate
  ```

  ## Usage
  [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
+ [Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)

  Here's a quick example to parse documents using Lexoid:
  ``` python
@@ -96,7 +108,13 @@ Initial results (_more updates soon_)
  | Rank | Model/Framework | Similarity | Time (s) |
  |------|-----------|------------|----------|
  | 1 | gpt-4o | 0.799 | 21.77|
- | 2 | gemini-1.5-pro | 0.742 | 15.77 |
- | 3 | gpt-4o-mini | 0.721 | 14.86 |
- | 4 | gemini-1.5-flash | 0.702 | 4.56 |
+ | 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
+ | 3 | gemini-exp-1121 | 0.779 | 30.88 |
+ | 4 | gemini-1.5-pro | 0.742 | 15.77 |
+ | 5 | gpt-4o-mini | 0.721 | 14.86 |
+ | 6 | gemini-1.5-flash | 0.702 | 4.56 |
+ | 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
+ | 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
+ | 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
+ | 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |
@@ -8,38 +8,46 @@ Lexoid is an efficient document parsing library that supports both LLM-based and
  - Collaborate with a permissive license

  ## Installation
- To install dependencies:
+ ### Installing with pip
  ```
- make install
+ pip install lexoid
  ```
- or, to install with dev-dependencies:
+
+ To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
  ```
- make dev
+ OPENAI_API_KEY=""
+ GOOGLE_API_KEY=""
  ```

- To activate virtual environment:
+ Optionally, to use `Playwright` for retrieving web content with the `.whl` package (else regular requests will be used by default):
  ```
- source .venv/bin/activate
+ playwright install --with-deps --only-shell chromium
  ```

- To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
+ ### Building `.whl` from source
+ To create `.whl`:
  ```
- OPENAI_API_KEY=""
- GOOGLE_API_KEY=""
+ make build
  ```

- To build a `.whl` file for testing:
+ ### Creating a local installation
+ To install dependencies:
+ ```
+ make install
+ ```
+ or, to install with dev-dependencies:
  ```
- poetry build
+ make dev
  ```

- Optionally, to use `Playwright` for retrieving web content with the `.whl` package (else regular requests will be used by default):
+ To activate virtual environment:
  ```
- playwright install --with-deps --only-shell chromium
+ source .venv/bin/activate
  ```

  ## Usage
  [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
+ [Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)

  Here's a quick example to parse documents using Lexoid:
  ``` python
@@ -67,6 +75,12 @@ Initial results (_more updates soon_)
  | Rank | Model/Framework | Similarity | Time (s) |
  |------|-----------|------------|----------|
  | 1 | gpt-4o | 0.799 | 21.77|
- | 2 | gemini-1.5-pro | 0.742 | 15.77 |
- | 3 | gpt-4o-mini | 0.721 | 14.86 |
- | 4 | gemini-1.5-flash | 0.702 | 4.56 |
+ | 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
+ | 3 | gemini-exp-1121 | 0.779 | 30.88 |
+ | 4 | gemini-1.5-pro | 0.742 | 15.77 |
+ | 5 | gpt-4o-mini | 0.721 | 14.86 |
+ | 6 | gemini-1.5-flash | 0.702 | 4.56 |
+ | 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
+ | 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
+ | 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
+ | 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |
@@ -10,10 +10,13 @@ from lexoid.core.prompt_templates import (
  INSTRUCTIONS_ADD_PG_BREAK,
  OPENAI_USER_PROMPT,
  PARSER_PROMPT,
+ LLAMA_PARSER_PROMPT,
  )
  from lexoid.core.utils import convert_image_to_pdf
  from loguru import logger
  from openai import OpenAI
+ from huggingface_hub import InferenceClient
+ from together import Together


  def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
@@ -22,10 +25,13 @@ def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
  model = kwargs.get("model")
  if model.startswith("gemini"):
  return parse_with_gemini(path, raw, **kwargs)
- elif model.startswith("gpt"):
- return parse_with_gpt(path, raw, **kwargs)
- else:
- raise ValueError(f"Unsupported model: {model}")
+ if model.startswith("gpt"):
+ return parse_with_api(path, raw, api="openai", **kwargs)
+ if model.startswith("meta-llama"):
+ if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
+ return parse_with_api(path, raw, api="together", **kwargs)
+ return parse_with_api(path, raw, api="huggingface", **kwargs)
+ raise ValueError(f"Unsupported model: {model}")


  def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
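The new dispatch can be exercised as in the sketch below. The import path and the sample file name are assumptions for illustration (neither appears in this diff); the routing summarized in the comments mirrors the hunk above.

```python
# Assumed module path and placeholder file name, not taken from this diff.
from lexoid.core.parse_type.parse_llm_doc import parse_llm_doc

# Dispatch by model-name prefix (per the hunk above):
#   "gemini*"      -> parse_with_gemini
#   "gpt*"         -> parse_with_api(api="openai")
#   "meta-llama/*" -> Together AI for "*Turbo" or Llama-Vision-Free, else Hugging Face
markdown = parse_llm_doc(
    "sample.pdf",  # placeholder input path
    raw=True,      # return raw text instead of structured per-page output
    model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",  # routed to Together AI
    temperature=0.7,
    max_tokens=1024,
)
print(markdown)
```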
@@ -120,8 +126,30 @@ def convert_pdf_page_to_base64(
  return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


- def parse_with_gpt(path: str, raw: bool, **kwargs) -> List[Dict] | str:
- client = OpenAI()
+ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
+ """
+ Parse documents (PDFs or images) using various vision model APIs.
+
+ Args:
+ path (str): Path to the document to parse
+ raw (bool): If True, return raw text; if False, return structured data
+ api (str): Which API to use ("openai", "huggingface", or "together")
+ **kwargs: Additional arguments including model, temperature, title, etc.
+
+ Returns:
+ List[Dict] | str: Parsed content either as raw text or structured data
+ """
+ # Initialize appropriate client
+ clients = {
+ "openai": lambda: OpenAI(),
+ "huggingface": lambda: InferenceClient(
+ token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
+ ),
+ "together": lambda: Together(),
+ }
+ assert api in clients, f"Unsupported API: {api}"
+ logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
+ client = clients[api]()

  # Handle different input types
  mime_type, _ = mimetypes.guess_type(path)
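A minimal sketch of the environment the new client factory expects: the Hugging Face branch reads `HUGGINGFACEHUB_API_TOKEN` explicitly, while `OpenAI()` and `Together()` fall back to their SDKs' standard variables (`OPENAI_API_KEY` and `TOGETHER_API_KEY`). The values below are placeholders.

```python
import os

# Placeholder tokens; set real values (or put them in a .env file) before calling parse_with_api.
os.environ.setdefault("OPENAI_API_KEY", "sk-...")            # used by OpenAI()
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "hf_...")  # read explicitly by the "huggingface" branch
os.environ.setdefault("TOGETHER_API_KEY", "...")             # used by Together()
```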
@@ -129,50 +157,79 @@ def parse_with_gpt(path: str, raw: bool, **kwargs) -> List[Dict] | str:
  # Single image processing
  with open(path, "rb") as img_file:
  image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
- images = [(0, image_base64)]
+ images = [(0, f"data:{mime_type};base64,{image_base64}")]
  else:
  # PDF processing
  pdf_document = pdfium.PdfDocument(path)
  images = [
- (page_num, convert_pdf_page_to_base64(pdf_document, page_num))
+ (
+ page_num,
+ f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
+ )
  for page_num in range(len(pdf_document))
  ]

+ # API-specific message formatting
+ def get_messages(page_num: int, image_url: str) -> List[Dict]:
+ base_message = {
+ "type": "text",
+ "text": LLAMA_PARSER_PROMPT,
+ }
+ image_message = {
+ "type": "image_url",
+ "image_url": {"url": image_url},
+ }
+
+ if api == "openai":
+ return [
+ {
+ "role": "system",
+ "content": PARSER_PROMPT.format(
+ custom_instructions=INSTRUCTIONS_ADD_PG_BREAK
+ ),
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
+ },
+ image_message,
+ ],
+ },
+ ]
+ else:
+ return [
+ {
+ "role": "user",
+ "content": [base_message, image_message],
+ }
+ ]
+
  # Process each page/image
  all_results = []
- for page_num, image_base64 in images:
- messages = [
- {
- "role": "system",
- "content": PARSER_PROMPT,
- },
- {
- "role": "user",
- "content": [
- {
- "type": "text",
- "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
- },
- {
- "type": "image_url",
- "image_url": {"url": f"data:image/png;base64,{image_base64}"},
- },
- ],
- },
- ]
+ for page_num, image_url in images:
+ messages = get_messages(page_num, image_url)

- # Get completion from GPT-4 Vision
- response = client.chat.completions.create(
- model=kwargs["model"],
- temperature=kwargs.get("temperature", 0.7),
- messages=messages,
- )
+ # Common completion parameters
+ completion_params = {
+ "model": kwargs["model"],
+ "messages": messages,
+ "max_tokens": kwargs.get("max_tokens", 1024),
+ "temperature": kwargs.get("temperature", 0.7),
+ }
+
+ # Get completion from selected API
+ response = client.chat.completions.create(**completion_params)

  # Extract the response text
  page_text = response.choices[0].message.content
  if kwargs.get("verbose", None):
  logger.debug(f"Page {page_num + 1} response: {page_text}")
- result = ""
+
+ # Extract content between output tags if present
+ result = page_text
  if "<output>" in page_text:
  result = page_text.split("<output>")[1].strip()
  if "</output>" in result:
@@ -89,15 +89,21 @@ def process_table(table) -> str:

  # Convert to DataFrame and handle empty cells
  df = pd.DataFrame(table_data)
+ df.replace("", pd.NA, inplace=True)
+ df = df.dropna(how="all", axis=0)
+ df = df.dropna(how="all", axis=1)
  df = df.fillna("")
+ if len(df) == 0:
+ return ""

  # Use first row as header and clean it up
  df.columns = df.iloc[0]
- df = df.drop(0)
+ df = df.drop(df.index[0])
+ df.replace(r"\n", "<br>", regex=True, inplace=True)

  # Convert to markdown with some formatting options
  markdown_table = df.to_markdown(index=False, tablefmt="pipe")
- return f"\n{markdown_table}\n\n"  # Add newlines for proper markdown rendering
+ return f"\n{markdown_table}\n\n"


  def embed_links_in_text(page, text, links):
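The new `process_table` steps can be seen on a toy table: empty strings become NA so fully empty rows and columns are dropped, the first surviving row is promoted to the header, and embedded newlines are kept as `<br>` so each table row stays on one Markdown line. The data below is illustrative, not from the package.

```python
import pandas as pd

table_data = [
    ["Name", "Score", ""],
    ["Alice", "line1\nline2", ""],
    ["", "", ""],  # fully empty row
]
df = pd.DataFrame(table_data)
df.replace("", pd.NA, inplace=True)
df = df.dropna(how="all", axis=0)   # drop all-empty rows
df = df.dropna(how="all", axis=1)   # drop all-empty columns
df = df.fillna("")
df.columns = df.iloc[0]             # promote the first row to the header
df = df.drop(df.index[0])
df.replace(r"\n", "<br>", regex=True, inplace=True)
print(df.to_markdown(index=False, tablefmt="pipe"))
# prints a one-row pipe-format Markdown table: Alice | line1<br>line2
```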
@@ -157,8 +163,20 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
  x_tolerance = kwargs.get("x_tolerance", 1)
  y_tolerance = kwargs.get("y_tolerance", 5)

- # First, identify tables and their positions
- tables = page.find_tables()
+ # Table settings
+ vertical_strategy = kwargs.get("vertical_strategy", "lines")
+ horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
+ snap_x_tolerance = kwargs.get("snap_x_tolerance", 10)
+ snap_y_tolerance = kwargs.get("snap_y_tolerance", 0)
+
+ tables = page.find_tables(
+ table_settings={
+ "vertical_strategy": vertical_strategy,
+ "horizontal_strategy": horizontal_strategy,
+ "snap_x_tolerance": snap_x_tolerance,
+ "snap_y_tolerance": snap_y_tolerance,
+ }
+ )
  table_zones = [(table.bbox, process_table(table)) for table in tables]

  # Create a filtered page excluding table areas
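For reference, the same table settings can be tried directly with pdfplumber; this standalone sketch (placeholder file name) mirrors the defaults the patched helper now pulls from kwargs.

```python
import pdfplumber

with pdfplumber.open("sample.pdf") as pdf:  # placeholder path
    page = pdf.pages[0]
    tables = page.find_tables(
        table_settings={
            "vertical_strategy": "lines",
            "horizontal_strategy": "lines",
            "snap_x_tolerance": 10,  # snap nearly-aligned vertical edges together
            "snap_y_tolerance": 0,
        }
    )
    for table in tables:
        print(table.bbox, len(table.extract()), "rows")
```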
@@ -171,12 +189,46 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
  words = filtered_page.extract_words(
  x_tolerance=x_tolerance,
  y_tolerance=y_tolerance,
- extra_attrs=["size", "top", "bottom"],
+ extra_attrs=["size", "top", "bottom", "fontname"],
  )

- def format_paragraph(text):
- text = " ".join(text.split())
- return f"{text}\n\n"
+ def format_paragraph(text_elements):
+ """Format a paragraph with styling applied to individual words"""
+ formatted_words = []
+ for element in text_elements:
+ text = element["text"]
+ formatting = get_text_formatting(element)
+ formatted_words.append(apply_markdown_formatting(text, formatting))
+ return f"{' '.join(formatted_words)}\n\n"
+
+ def get_text_formatting(word):
+ """
+ Detect text formatting based on font properties
+ Returns a dict of formatting attributes
+ """
+ formatting = {
+ "bold": False,
+ "italic": False,
+ }
+
+ # Check font name for common bold/italic indicators
+ font_name = word.get("fontname", "").lower()
+ if any(style in font_name for style in ["bold", "heavy", "black"]):
+ formatting["bold"] = True
+ if any(style in font_name for style in ["italic", "oblique"]):
+ formatting["italic"] = True
+
+ return formatting
+
+ def apply_markdown_formatting(text, formatting):
+ """Apply markdown formatting to text based on detected styles"""
+ if formatting["bold"] and formatting["italic"]:
+ text = f"***{text}***"
+ elif formatting["bold"]:
+ text = f"**{text}**"
+ elif formatting["italic"]:
+ text = f"*{text}*"
+ return text

  def detect_heading_level(font_size):
  if font_size >= 24:
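A quick illustrative check of the new fontname heuristics; the example PostScript font names are hypothetical, and pdfplumber exposes the real name through the `fontname` word attribute requested above.

```python
def get_text_formatting(word):
    # Same heuristic as the hunk above: infer bold/italic from the font name.
    font_name = word.get("fontname", "").lower()
    return {
        "bold": any(s in font_name for s in ["bold", "heavy", "black"]),
        "italic": any(s in font_name for s in ["italic", "oblique"]),
    }

for name in ["Helvetica", "Arial-BoldMT", "Times-Italic", "ABCDEE+Lato-BlackOblique"]:
    print(name, get_text_formatting({"fontname": name}))
# Helvetica         -> neither
# Arial-BoldMT      -> bold           => **text**
# Times-Italic      -> italic         => *text*
# Lato-BlackOblique -> bold + italic  => ***text***
```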
@@ -205,17 +257,18 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
  while tables and word["bottom"] > tables[0][1]["bottom"]:
  content_elements.append(tables.pop(0))
  content_elements.append(("word", word))
+ content_elements.extend(tables)

  for element_type, element in content_elements:
  if element_type == "table":
  # If there are any pending paragraphs or headings, add them first
  if current_heading:
  level = detect_heading_level(current_heading[0]["size"])
- heading_text = " ".join(word["text"] for word in current_heading)
- markdown_content.append(f"{'#' * level} {heading_text}\n\n")
+ heading_text = format_paragraph(current_heading)
+ markdown_content.append(f"{'#' * level} {heading_text}")
  current_heading = []
  if current_paragraph:
- markdown_content.append(format_paragraph(" ".join(current_paragraph)))
+ markdown_content.append(format_paragraph(current_paragraph))
  current_paragraph = []
  # Add the table
  markdown_content.append(element["content"])
@@ -233,46 +286,42 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
  # If we were collecting a heading
  if current_heading:
  level = detect_heading_level(current_heading[0]["size"])
- heading_text = " ".join(word["text"] for word in current_heading)
- markdown_content.append(f"{'#' * level} {heading_text}\n\n")
+ heading_text = format_paragraph(current_heading)
+ markdown_content.append(f"{'#' * level} {heading_text}")
  current_heading = []

  # If we were collecting a paragraph
  if current_paragraph:
- markdown_content.append(
- format_paragraph(" ".join(current_paragraph))
- )
+ markdown_content.append(format_paragraph(current_paragraph))
  current_paragraph = []

  # Add word to appropriate collection
  if heading_level:
  if current_paragraph:  # Flush any pending paragraph
- markdown_content.append(
- format_paragraph(" ".join(current_paragraph))
- )
+ markdown_content.append(format_paragraph(current_paragraph))
  current_paragraph = []
- current_heading.append({"text": word["text"], "size": word["size"]})
+ current_heading.append(word)
  else:
  if current_heading:  # Flush any pending heading
  level = detect_heading_level(current_heading[0]["size"])
- heading_text = " ".join(word["text"] for word in current_heading)
- markdown_content.append(f"{'#' * level} {heading_text}\n\n")
+ heading_text = format_paragraph(current_heading)
+ markdown_content.append(f"{'#' * level} {heading_text}")
  current_heading = []
- current_paragraph.append(word["text"])
+ current_paragraph.append(word)

  last_y = word["top"]

  # Handle remaining content
  if current_heading:
  level = detect_heading_level(current_heading[0]["size"])
- heading_text = " ".join(word["text"] for word in current_heading)
- markdown_content.append(f"{'#' * level} {heading_text}\n\n")
+ heading_text = format_paragraph(current_heading)
+ markdown_content.append(f"{'#' * level} {heading_text}")

  if current_paragraph:
- markdown_content.append(format_paragraph(" ".join(current_paragraph)))
+ markdown_content.append(format_paragraph(current_paragraph))

  # Process links for the page
- content = "".join(markdown_content)  # Process links using the new function
+ content = "".join(markdown_content)
  if page.annots:
  links = []
  for annot in page.annots:
@@ -283,6 +332,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
  if links:
  content = embed_links_in_text(page, content, links)

+ # Remove redundant formatting
+ content = content.replace("** **", " ").replace("* *", " ")
+
  return content

@@ -76,3 +76,22 @@ Ensure accurate representation of all content, including tables and visual eleme
  """

  INSTRUCTIONS_ADD_PG_BREAK = "Insert a `<page-break>` tag between the content of each page to maintain the original page structure."
+
+ LLAMA_PARSER_PROMPT = """\
+ You are a document conversion assistant. Your task is to accurately reproduce the content of an image in Markdown and HTML format, maintaining the visual structure and layout of the original document as closely as possible.
+
+ Instructions:
+ 1. Use a combination of Markdown and HTML to replicate the document's layout and formatting.
+ 2. Reproduce all text content exactly as it appears, including preserving capitalization, punctuation, and any apparent errors or inconsistencies in the original.
+ 3. Use appropriate Markdown syntax for headings, emphasis (bold, italic), and lists where applicable.
+ 4. Always use HTML (`<table>`, `<tr>`, `<td>`) to represent tabular data. Include `colspan` and `rowspan` attributes if needed.
+ 5. For figures, graphs, or diagrams, represent them using `<img>` tags and use appropriate `alt` text.
+ 6. For handwritten documents, reproduce the content as typed text, maintaining the original structure and layout.
+ 7. Do not include any descriptions of the document's appearance, paper type, or writing implements used.
+ 8. Do not add any explanatory notes, comments, or additional information outside of the converted content.
+ 9. Ensure all special characters, symbols, and equations are accurately represented.
+ 10. Provide the output only once, without any duplication.
+ 11. Enclose the entire output within <output> and </output> tags.
+
+ Output the converted content directly in Markdown and HTML without any additional explanations, descriptions, or notes.
+ """
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "lexoid"
- version = "0.1.6"
+ version = "0.1.7"
  description = ""
  authors = []
  readme = "README.md"
@@ -24,16 +24,16 @@ playwright = "^1.49.0"
  docx2pdf = "^0.1.8"
  python-docx = "^1.1.2"
  nest-asyncio ="^1.6.0"
+ pyqt5 = {version = "^5.15.11", markers = "platform_system != 'debian'"}
+ pyqtwebengine = {version = "^5.15.7", markers = "platform_system != 'debian'"}
+ huggingface-hub = "^0.27.0"
+ together = "^1.3.10"

  [tool.poetry.group.dev.dependencies]
  ipykernel = "^6.29.5"
  pytest-asyncio = "^0.23.8"
  pytest = "^8.3.2"

- [tool.poetry.group.qt5.dependencies]
- pyqt5 = "^5.15.11"
- pyqtwebengine = "^5.15.7"
-
  [build-system]
- requires = ["poetry-core"]
+ requires = ["poetry-core", "wheel"]
  build-backend = "poetry.core.masonry.api"
File without changes
File without changes
File without changes