sparrow-parse 0.3.4__tar.gz → 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/PKG-INFO +11 -98
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/README.md +8 -95
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/setup.py +3 -3
- sparrow-parse-0.3.5/sparrow_parse/__init__.py +1 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/PKG-INFO +11 -98
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/SOURCES.txt +0 -5
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/requires.txt +1 -4
- sparrow-parse-0.3.4/sparrow_parse/__init__.py +0 -1
- sparrow-parse-0.3.4/sparrow_parse/extractors/html_extractor.py +0 -251
- sparrow-parse-0.3.4/sparrow_parse/helpers/html_extractor_helper.py +0 -374
- sparrow-parse-0.3.4/sparrow_parse/processors/markdown_processor.py +0 -137
- sparrow-parse-0.3.4/sparrow_parse/processors/unstructured_processor.py +0 -178
- sparrow-parse-0.3.4/sparrow_parse/temp.py +0 -27
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/setup.cfg +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/__main__.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/extractors/__init__.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/extractors/vllm_extractor.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/helpers/__init__.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/processors/__init__.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/processors/table_structure_processor.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/vllm/__init__.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/vllm/huggingface_inference.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/vllm/inference_base.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/vllm/inference_factory.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/dependency_links.txt +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/entry_points.txt +0 -0
- {sparrow-parse-0.3.4 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/top_level.txt +0 -0
@@ -1,14 +1,14 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 0.3.4
|
4
|
-
Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
|
3
|
+
Version: 0.3.5
|
4
|
+
Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
Author: Andrej Baranovskij
|
7
7
|
Author-email: andrejus.baranovskis@gmail.com
|
8
8
|
License: UNKNOWN
|
9
9
|
Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
10
10
|
Project-URL: Repository, https://github.com/katanaml/sparrow
|
11
|
-
Keywords: llm,
|
11
|
+
Keywords: llm,vllm,ocr,vision
|
12
12
|
Platform: UNKNOWN
|
13
13
|
Classifier: Operating System :: OS Independent
|
14
14
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
@@ -21,7 +21,7 @@ Description-Content-Type: text/markdown
|
|
21
21
|
|
22
22
|
## Description
|
23
23
|
|
24
|
-
This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
|
24
|
+
This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) library with helpful methods for data pre-processing, parsing and extracting information. This library relies on Visual LLM functionality, Table Transformers and is part of Sparrow. Check main [README](https://github.com/katanaml/sparrow)
|
25
25
|
|
26
26
|
## Install
|
27
27
|
|
@@ -29,101 +29,14 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
|
|
29
29
|
pip install sparrow-parse
|
30
30
|
```
|
31
31
|
|
32
|
-
## Pre-processing
|
33
|
-
|
34
|
-
### Unstructured
|
35
|
-
|
36
|
-
```
|
37
|
-
from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
|
38
|
-
|
39
|
-
processor = UnstructuredProcessor()
|
40
|
-
|
41
|
-
content, table_content = processor.extract_data(
|
42
|
-
file_path, # file to process
|
43
|
-
strategy, # data processing strategy supported by unstructured
|
44
|
-
model_name, # model supported by unstructured
|
45
|
-
options, # table extraction into HTML format
|
46
|
-
local, # True if running from CLI, or False if running from FastAPI
|
47
|
-
debug) # Debug
|
48
|
-
```
|
49
|
-
|
50
|
-
Example:
|
51
|
-
|
52
|
-
*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
|
53
|
-
|
54
|
-
*strategy* - `hi_res`
|
55
|
-
|
56
|
-
*model_name* - `yolox`
|
57
|
-
|
58
|
-
*options* - `['tables', 'unstructured']`
|
59
|
-
|
60
|
-
*local* - `True`
|
61
|
-
|
62
|
-
*debug* - `True`
|
63
|
-
|
64
|
-
### Markdown
|
65
|
-
|
66
|
-
```
|
67
|
-
from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
|
68
|
-
|
69
|
-
processor = MarkdownProcessor()
|
70
|
-
|
71
|
-
content, table_content = processor.extract_data(
|
72
|
-
file_path, # file to process
|
73
|
-
options, # table extraction into HTML format
|
74
|
-
local, # True if running from CLI, or False if running from FastAPI
|
75
|
-
debug) # Debug
|
76
|
-
```
|
77
|
-
|
78
|
-
Example:
|
79
|
-
|
80
|
-
*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
|
81
|
-
|
82
|
-
*options* - `['tables', 'markdown']`
|
83
|
-
|
84
|
-
*local* - `True`
|
85
|
-
|
86
|
-
*debug* - `True`
|
87
|
-
|
88
32
|
## Parsing and extraction
|
89
33
|
|
90
|
-
###
|
34
|
+
### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra
|
91
35
|
|
92
36
|
```
|
93
|
-
from sparrow_parse.
|
37
|
+
from sparrow_parse.vllm.inference_factory import InferenceFactory
|
38
|
+
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
|
94
39
|
|
95
|
-
extractor = HTMLExtractor()
|
96
|
-
|
97
|
-
answer, targets_unprocessed = extractor.read_data(
|
98
|
-
target_columns, # list of table columns data to fetch
|
99
|
-
data, # list of HTML tables
|
100
|
-
column_keywords, # list of valid column names, can be empty. Useful to filter junk content
|
101
|
-
group_by_rows, # JSON result grouping
|
102
|
-
update_targets, # Set to true, if page contains multiple tables with the same columns
|
103
|
-
local, # True if running from CLI, or False if running from FastAPI
|
104
|
-
debug) # Debug
|
105
|
-
|
106
|
-
```
|
107
|
-
|
108
|
-
Example:
|
109
|
-
|
110
|
-
*target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
|
111
|
-
|
112
|
-
*data* - `list of HTML tables`
|
113
|
-
|
114
|
-
*column_keywords* - `None`
|
115
|
-
|
116
|
-
*group_by_rows* - `True`
|
117
|
-
|
118
|
-
*update_targets* - `True`
|
119
|
-
|
120
|
-
*local* - `True`
|
121
|
-
|
122
|
-
*debug* - `True`
|
123
|
-
|
124
|
-
### Sparrow Parse VL (vision-language) extractor
|
125
|
-
|
126
|
-
```
|
127
40
|
extractor = VLLMExtractor()
|
128
41
|
|
129
42
|
# export HF_TOKEN="hf_"
|
@@ -141,8 +54,8 @@ model_inference_instance = factory.get_inference_instance()
|
|
141
54
|
|
142
55
|
input_data = [
|
143
56
|
{
|
144
|
-
"image": "/
|
145
|
-
"text_input": "retrieve
|
57
|
+
"image": "/data/bonds_table.png",
|
58
|
+
"text_input": "retrieve all data. return response in JSON format"
|
146
59
|
}
|
147
60
|
]
|
148
61
|
|
@@ -151,7 +64,7 @@ result = extractor.run_inference(model_inference_instance, input_data, generic_q
|
|
151
64
|
print("Inference Result:", result)
|
152
65
|
```
|
153
66
|
|
154
|
-
## PDF
|
67
|
+
## PDF pre-processing
|
155
68
|
|
156
69
|
```
|
157
70
|
from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
|
@@ -166,7 +79,7 @@ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
|
|
166
79
|
|
167
80
|
Example:
|
168
81
|
|
169
|
-
*file_path* - `/
|
82
|
+
*file_path* - `/data/invoice_1.pdf`
|
170
83
|
|
171
84
|
*output_directory* - set to not `None`, for debug purposes only
|
172
85
|
|
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
## Description
|
4
4
|
|
5
|
-
This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
|
5
|
+
This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) library with helpful methods for data pre-processing, parsing and extracting information. This library relies on Visual LLM functionality, Table Transformers and is part of Sparrow. Check main [README](https://github.com/katanaml/sparrow)
|
6
6
|
|
7
7
|
## Install
|
8
8
|
|
@@ -10,101 +10,14 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
|
|
10
10
|
pip install sparrow-parse
|
11
11
|
```
|
12
12
|
|
13
|
-
## Pre-processing
|
14
|
-
|
15
|
-
### Unstructured
|
16
|
-
|
17
|
-
```
|
18
|
-
from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
|
19
|
-
|
20
|
-
processor = UnstructuredProcessor()
|
21
|
-
|
22
|
-
content, table_content = processor.extract_data(
|
23
|
-
file_path, # file to process
|
24
|
-
strategy, # data processing strategy supported by unstructured
|
25
|
-
model_name, # model supported by unstructured
|
26
|
-
options, # table extraction into HTML format
|
27
|
-
local, # True if running from CLI, or False if running from FastAPI
|
28
|
-
debug) # Debug
|
29
|
-
```
|
30
|
-
|
31
|
-
Example:
|
32
|
-
|
33
|
-
*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
|
34
|
-
|
35
|
-
*strategy* - `hi_res`
|
36
|
-
|
37
|
-
*model_name* - `yolox`
|
38
|
-
|
39
|
-
*options* - `['tables', 'unstructured']`
|
40
|
-
|
41
|
-
*local* - `True`
|
42
|
-
|
43
|
-
*debug* - `True`
|
44
|
-
|
45
|
-
### Markdown
|
46
|
-
|
47
|
-
```
|
48
|
-
from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
|
49
|
-
|
50
|
-
processor = MarkdownProcessor()
|
51
|
-
|
52
|
-
content, table_content = processor.extract_data(
|
53
|
-
file_path, # file to process
|
54
|
-
options, # table extraction into HTML format
|
55
|
-
local, # True if running from CLI, or False if running from FastAPI
|
56
|
-
debug) # Debug
|
57
|
-
```
|
58
|
-
|
59
|
-
Example:
|
60
|
-
|
61
|
-
*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
|
62
|
-
|
63
|
-
*options* - `['tables', 'markdown']`
|
64
|
-
|
65
|
-
*local* - `True`
|
66
|
-
|
67
|
-
*debug* - `True`
|
68
|
-
|
69
13
|
## Parsing and extraction
|
70
14
|
|
71
|
-
###
|
15
|
+
### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra
|
72
16
|
|
73
17
|
```
|
74
|
-
from sparrow_parse.
|
18
|
+
from sparrow_parse.vllm.inference_factory import InferenceFactory
|
19
|
+
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
|
75
20
|
|
76
|
-
extractor = HTMLExtractor()
|
77
|
-
|
78
|
-
answer, targets_unprocessed = extractor.read_data(
|
79
|
-
target_columns, # list of table columns data to fetch
|
80
|
-
data, # list of HTML tables
|
81
|
-
column_keywords, # list of valid column names, can be empty. Useful to filter junk content
|
82
|
-
group_by_rows, # JSON result grouping
|
83
|
-
update_targets, # Set to true, if page contains multiple tables with the same columns
|
84
|
-
local, # True if running from CLI, or False if running from FastAPI
|
85
|
-
debug) # Debug
|
86
|
-
|
87
|
-
```
|
88
|
-
|
89
|
-
Example:
|
90
|
-
|
91
|
-
*target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
|
92
|
-
|
93
|
-
*data* - `list of HTML tables`
|
94
|
-
|
95
|
-
*column_keywords* - `None`
|
96
|
-
|
97
|
-
*group_by_rows* - `True`
|
98
|
-
|
99
|
-
*update_targets* - `True`
|
100
|
-
|
101
|
-
*local* - `True`
|
102
|
-
|
103
|
-
*debug* - `True`
|
104
|
-
|
105
|
-
### Sparrow Parse VL (vision-language) extractor
|
106
|
-
|
107
|
-
```
|
108
21
|
extractor = VLLMExtractor()
|
109
22
|
|
110
23
|
# export HF_TOKEN="hf_"
|
@@ -122,8 +35,8 @@ model_inference_instance = factory.get_inference_instance()
|
|
122
35
|
|
123
36
|
input_data = [
|
124
37
|
{
|
125
|
-
"image": "/
|
126
|
-
"text_input": "retrieve
|
38
|
+
"image": "/data/bonds_table.png",
|
39
|
+
"text_input": "retrieve all data. return response in JSON format"
|
127
40
|
}
|
128
41
|
]
|
129
42
|
|
@@ -132,7 +45,7 @@ result = extractor.run_inference(model_inference_instance, input_data, generic_q
|
|
132
45
|
print("Inference Result:", result)
|
133
46
|
```
|
134
47
|
|
135
|
-
## PDF
|
48
|
+
## PDF pre-processing
|
136
49
|
|
137
50
|
```
|
138
51
|
from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
|
@@ -147,7 +60,7 @@ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
|
|
147
60
|
|
148
61
|
Example:
|
149
62
|
|
150
|
-
*file_path* - `/
|
63
|
+
*file_path* - `/data/invoice_1.pdf`
|
151
64
|
|
152
65
|
*output_directory* - set to not `None`, for debug purposes only
|
153
66
|
|
@@ -8,10 +8,10 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
|
|
8
8
|
|
9
9
|
setup(
|
10
10
|
name="sparrow-parse",
|
11
|
-
version="0.3.4",
|
11
|
+
version="0.3.5",
|
12
12
|
author="Andrej Baranovskij",
|
13
13
|
author_email="andrejus.baranovskis@gmail.com",
|
14
|
-
description="Sparrow Parse is a Python package for parsing and extracting information from documents.",
|
14
|
+
description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
|
15
15
|
long_description=long_description,
|
16
16
|
long_description_content_type="text/markdown",
|
17
17
|
url="https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse",
|
@@ -30,7 +30,7 @@ setup(
|
|
30
30
|
'sparrow-parse=sparrow_parse:main',
|
31
31
|
],
|
32
32
|
},
|
33
|
-
keywords="llm,
|
33
|
+
keywords="llm, vllm, ocr, vision",
|
34
34
|
packages=find_packages(),
|
35
35
|
python_requires='>=3.10',
|
36
36
|
install_requires=requirements,
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = '0.3.5'
|
@@ -1,14 +1,14 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 0.3.4
|
4
|
-
Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
|
3
|
+
Version: 0.3.5
|
4
|
+
Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
Author: Andrej Baranovskij
|
7
7
|
Author-email: andrejus.baranovskis@gmail.com
|
8
8
|
License: UNKNOWN
|
9
9
|
Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
10
10
|
Project-URL: Repository, https://github.com/katanaml/sparrow
|
11
|
-
Keywords: llm,
|
11
|
+
Keywords: llm,vllm,ocr,vision
|
12
12
|
Platform: UNKNOWN
|
13
13
|
Classifier: Operating System :: OS Independent
|
14
14
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
@@ -21,7 +21,7 @@ Description-Content-Type: text/markdown
|
|
21
21
|
|
22
22
|
## Description
|
23
23
|
|
24
|
-
This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
|
24
|
+
This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) library with helpful methods for data pre-processing, parsing and extracting information. This library relies on Visual LLM functionality, Table Transformers and is part of Sparrow. Check main [README](https://github.com/katanaml/sparrow)
|
25
25
|
|
26
26
|
## Install
|
27
27
|
|
@@ -29,101 +29,14 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
|
|
29
29
|
pip install sparrow-parse
|
30
30
|
```
|
31
31
|
|
32
|
-
## Pre-processing
|
33
|
-
|
34
|
-
### Unstructured
|
35
|
-
|
36
|
-
```
|
37
|
-
from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
|
38
|
-
|
39
|
-
processor = UnstructuredProcessor()
|
40
|
-
|
41
|
-
content, table_content = processor.extract_data(
|
42
|
-
file_path, # file to process
|
43
|
-
strategy, # data processing strategy supported by unstructured
|
44
|
-
model_name, # model supported by unstructured
|
45
|
-
options, # table extraction into HTML format
|
46
|
-
local, # True if running from CLI, or False if running from FastAPI
|
47
|
-
debug) # Debug
|
48
|
-
```
|
49
|
-
|
50
|
-
Example:
|
51
|
-
|
52
|
-
*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
|
53
|
-
|
54
|
-
*strategy* - `hi_res`
|
55
|
-
|
56
|
-
*model_name* - `yolox`
|
57
|
-
|
58
|
-
*options* - `['tables', 'unstructured']`
|
59
|
-
|
60
|
-
*local* - `True`
|
61
|
-
|
62
|
-
*debug* - `True`
|
63
|
-
|
64
|
-
### Markdown
|
65
|
-
|
66
|
-
```
|
67
|
-
from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
|
68
|
-
|
69
|
-
processor = MarkdownProcessor()
|
70
|
-
|
71
|
-
content, table_content = processor.extract_data(
|
72
|
-
file_path, # file to process
|
73
|
-
options, # table extraction into HTML format
|
74
|
-
local, # True if running from CLI, or False if running from FastAPI
|
75
|
-
debug) # Debug
|
76
|
-
```
|
77
|
-
|
78
|
-
Example:
|
79
|
-
|
80
|
-
*file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
|
81
|
-
|
82
|
-
*options* - `['tables', 'markdown']`
|
83
|
-
|
84
|
-
*local* - `True`
|
85
|
-
|
86
|
-
*debug* - `True`
|
87
|
-
|
88
32
|
## Parsing and extraction
|
89
33
|
|
90
|
-
###
|
34
|
+
### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra
|
91
35
|
|
92
36
|
```
|
93
|
-
from sparrow_parse.
|
37
|
+
from sparrow_parse.vllm.inference_factory import InferenceFactory
|
38
|
+
from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
|
94
39
|
|
95
|
-
extractor = HTMLExtractor()
|
96
|
-
|
97
|
-
answer, targets_unprocessed = extractor.read_data(
|
98
|
-
target_columns, # list of table columns data to fetch
|
99
|
-
data, # list of HTML tables
|
100
|
-
column_keywords, # list of valid column names, can be empty. Useful to filter junk content
|
101
|
-
group_by_rows, # JSON result grouping
|
102
|
-
update_targets, # Set to true, if page contains multiple tables with the same columns
|
103
|
-
local, # True if running from CLI, or False if running from FastAPI
|
104
|
-
debug) # Debug
|
105
|
-
|
106
|
-
```
|
107
|
-
|
108
|
-
Example:
|
109
|
-
|
110
|
-
*target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
|
111
|
-
|
112
|
-
*data* - `list of HTML tables`
|
113
|
-
|
114
|
-
*column_keywords* - `None`
|
115
|
-
|
116
|
-
*group_by_rows* - `True`
|
117
|
-
|
118
|
-
*update_targets* - `True`
|
119
|
-
|
120
|
-
*local* - `True`
|
121
|
-
|
122
|
-
*debug* - `True`
|
123
|
-
|
124
|
-
### Sparrow Parse VL (vision-language) extractor
|
125
|
-
|
126
|
-
```
|
127
40
|
extractor = VLLMExtractor()
|
128
41
|
|
129
42
|
# export HF_TOKEN="hf_"
|
@@ -141,8 +54,8 @@ model_inference_instance = factory.get_inference_instance()
|
|
141
54
|
|
142
55
|
input_data = [
|
143
56
|
{
|
144
|
-
"image": "/
|
145
|
-
"text_input": "retrieve
|
57
|
+
"image": "/data/bonds_table.png",
|
58
|
+
"text_input": "retrieve all data. return response in JSON format"
|
146
59
|
}
|
147
60
|
]
|
148
61
|
|
@@ -151,7 +64,7 @@ result = extractor.run_inference(model_inference_instance, input_data, generic_q
|
|
151
64
|
print("Inference Result:", result)
|
152
65
|
```
|
153
66
|
|
154
|
-
## PDF
|
67
|
+
## PDF pre-processing
|
155
68
|
|
156
69
|
```
|
157
70
|
from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
|
@@ -166,7 +79,7 @@ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
|
|
166
79
|
|
167
80
|
Example:
|
168
81
|
|
169
|
-
*file_path* - `/
|
82
|
+
*file_path* - `/data/invoice_1.pdf`
|
170
83
|
|
171
84
|
*output_directory* - set to not `None`, for debug purposes only
|
172
85
|
|
@@ -2,7 +2,6 @@ README.md
|
|
2
2
|
setup.py
|
3
3
|
sparrow_parse/__init__.py
|
4
4
|
sparrow_parse/__main__.py
|
5
|
-
sparrow_parse/temp.py
|
6
5
|
sparrow_parse.egg-info/PKG-INFO
|
7
6
|
sparrow_parse.egg-info/SOURCES.txt
|
8
7
|
sparrow_parse.egg-info/dependency_links.txt
|
@@ -10,15 +9,11 @@ sparrow_parse.egg-info/entry_points.txt
|
|
10
9
|
sparrow_parse.egg-info/requires.txt
|
11
10
|
sparrow_parse.egg-info/top_level.txt
|
12
11
|
sparrow_parse/extractors/__init__.py
|
13
|
-
sparrow_parse/extractors/html_extractor.py
|
14
12
|
sparrow_parse/extractors/vllm_extractor.py
|
15
13
|
sparrow_parse/helpers/__init__.py
|
16
|
-
sparrow_parse/helpers/html_extractor_helper.py
|
17
14
|
sparrow_parse/helpers/pdf_optimizer.py
|
18
15
|
sparrow_parse/processors/__init__.py
|
19
|
-
sparrow_parse/processors/markdown_processor.py
|
20
16
|
sparrow_parse/processors/table_structure_processor.py
|
21
|
-
sparrow_parse/processors/unstructured_processor.py
|
22
17
|
sparrow_parse/vllm/__init__.py
|
23
18
|
sparrow_parse/vllm/huggingface_inference.py
|
24
19
|
sparrow_parse/vllm/inference_base.py
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = '0.3.4'
|