sparrow-parse 0.3.3__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/PKG-INFO +31 -86
  2. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/README.md +28 -83
  3. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/setup.py +3 -3
  4. sparrow-parse-0.3.5/sparrow_parse/__init__.py +1 -0
  5. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/PKG-INFO +31 -86
  6. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/SOURCES.txt +0 -5
  7. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/requires.txt +1 -4
  8. sparrow-parse-0.3.3/sparrow_parse/__init__.py +0 -1
  9. sparrow-parse-0.3.3/sparrow_parse/extractors/html_extractor.py +0 -251
  10. sparrow-parse-0.3.3/sparrow_parse/helpers/html_extractor_helper.py +0 -374
  11. sparrow-parse-0.3.3/sparrow_parse/processors/markdown_processor.py +0 -137
  12. sparrow-parse-0.3.3/sparrow_parse/processors/unstructured_processor.py +0 -178
  13. sparrow-parse-0.3.3/sparrow_parse/temp.py +0 -27
  14. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/setup.cfg +0 -0
  15. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/__main__.py +0 -0
  16. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/extractors/__init__.py +0 -0
  17. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/extractors/vllm_extractor.py +0 -0
  18. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/helpers/__init__.py +0 -0
  19. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
  20. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/processors/__init__.py +0 -0
  21. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/processors/table_structure_processor.py +0 -0
  22. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/vllm/__init__.py +0 -0
  23. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/vllm/huggingface_inference.py +0 -0
  24. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/vllm/inference_base.py +0 -0
  25. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/vllm/inference_factory.py +0 -0
  26. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
  27. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/dependency_links.txt +0 -0
  28. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/entry_points.txt +0 -0
  29. {sparrow-parse-0.3.3 → sparrow-parse-0.3.5}/sparrow_parse.egg-info/top_level.txt +0 -0
@@ -1,14 +1,14 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.3.3
4
- Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
3
+ Version: 0.3.5
4
+ Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
7
7
  Author-email: andrejus.baranovskis@gmail.com
8
8
  License: UNKNOWN
9
9
  Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
10
10
  Project-URL: Repository, https://github.com/katanaml/sparrow
11
- Keywords: llm,rag,vision
11
+ Keywords: llm,vllm,ocr,vision
12
12
  Platform: UNKNOWN
13
13
  Classifier: Operating System :: OS Independent
14
14
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
@@ -21,7 +21,7 @@ Description-Content-Type: text/markdown
21
21
 
22
22
  ## Description
23
23
 
24
- This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
24
+ This module implements the Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information. This library relies on Visual LLM functionality and Table Transformers, and is part of Sparrow. Check the main [README](https://github.com/katanaml/sparrow).
25
25
 
26
26
  ## Install
27
27
 
@@ -29,97 +29,42 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
29
29
  pip install sparrow-parse
30
30
  ```
31
31
 
32
- ## Pre-processing
33
-
34
- ### Unstructured
35
-
36
- ```
37
- from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
38
-
39
- processor = UnstructuredProcessor()
40
-
41
- content, table_content = processor.extract_data(
42
- file_path, # file to process
43
- strategy, # data processing strategy supported by unstructured
44
- model_name, # model supported by unstructured
45
- options, # table extraction into HTML format
46
- local, # True if running from CLI, or False if running from FastAPI
47
- debug) # Debug
48
- ```
49
-
50
- Example:
51
-
52
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
53
-
54
- *strategy* - `hi_res`
55
-
56
- *model_name* - `yolox`
57
-
58
- *options* - `['tables', 'unstructured']`
59
-
60
- *local* - `True`
61
-
62
- *debug* - `True`
63
-
64
- ### Markdown
65
-
66
- ```
67
- from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
68
-
69
- processor = MarkdownProcessor()
70
-
71
- content, table_content = processor.extract_data(
72
- file_path, # file to process
73
- options, # table extraction into HTML format
74
- local, # True if running from CLI, or False if running from FastAPI
75
- debug) # Debug
76
- ```
77
-
78
- Example:
79
-
80
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
81
-
82
- *options* - `['tables', 'markdown']`
83
-
84
- *local* - `True`
85
-
86
- *debug* - `True`
87
-
88
32
  ## Parsing and extraction
89
33
 
90
- ```
91
- from sparrow_parse.extractor.html_extractor import HTMLExtractor
92
-
93
- extractor = HTMLExtractor()
94
-
95
- answer, targets_unprocessed = extractor.read_data(
96
- target_columns, # list of table columns data to fetch
97
- data, # list of HTML tables
98
- column_keywords, # list of valid column names, can be empty. Useful to filter junk content
99
- group_by_rows, # JSON result grouping
100
- update_targets, # Set to true, if page contains multiple tables with the same columns
101
- local, # True if running from CLI, or False if running from FastAPI
102
- debug) # Debug
34
+ ### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra
103
35
 
104
36
  ```
37
+ from sparrow_parse.vllm.inference_factory import InferenceFactory
38
+ from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
105
39
 
106
- Example:
107
-
108
- *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
109
-
110
- *data* - `list of HTML tables`
40
+ extractor = VLLMExtractor()
111
41
 
112
- *column_keywords* - `None`
42
+ # export HF_TOKEN="hf_"
43
+ config = {
44
+ "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
45
+ "hf_space": "katanaml/sparrow-qwen2-vl-7b",
46
+ "hf_token": os.getenv('HF_TOKEN'),
47
+ # Additional fields for local GPU inference
48
+ # "device": "cuda", "model_path": "model.pth"
49
+ }
113
50
 
114
- *group_by_rows* - `True`
51
+ # Use the factory to get the correct instance
52
+ factory = InferenceFactory(config)
53
+ model_inference_instance = factory.get_inference_instance()
115
54
 
116
- *update_targets* - `True`
55
+ input_data = [
56
+ {
57
+ "image": "/data/bonds_table.png",
58
+ "text_input": "retrieve all data. return response in JSON format"
59
+ }
60
+ ]
117
61
 
118
- *local* - `True`
119
-
120
- *debug* - `True`
62
+ # Now you can run inference without knowing which implementation is used
63
+ result = extractor.run_inference(model_inference_instance, input_data, generic_query=False, debug=True)
64
+ print("Inference Result:", result)
65
+ ```
121
66
 
122
- ## PDF optimization
67
+ ## PDF pre-processing
123
68
 
124
69
  ```
125
70
  from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
@@ -134,7 +79,7 @@ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
134
79
 
135
80
  Example:
136
81
 
137
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
82
+ *file_path* - `/data/invoice_1.pdf`
138
83
 
139
84
  *output_directory* - set to not `None`, for debug purposes only
140
85
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  ## Description
4
4
 
5
- This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
5
+ This module implements the Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information. This library relies on Visual LLM functionality and Table Transformers, and is part of Sparrow. Check the main [README](https://github.com/katanaml/sparrow).
6
6
 
7
7
  ## Install
8
8
 
@@ -10,97 +10,42 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
10
10
  pip install sparrow-parse
11
11
  ```
12
12
 
13
- ## Pre-processing
14
-
15
- ### Unstructured
16
-
17
- ```
18
- from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
19
-
20
- processor = UnstructuredProcessor()
21
-
22
- content, table_content = processor.extract_data(
23
- file_path, # file to process
24
- strategy, # data processing strategy supported by unstructured
25
- model_name, # model supported by unstructured
26
- options, # table extraction into HTML format
27
- local, # True if running from CLI, or False if running from FastAPI
28
- debug) # Debug
29
- ```
30
-
31
- Example:
32
-
33
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
34
-
35
- *strategy* - `hi_res`
36
-
37
- *model_name* - `yolox`
38
-
39
- *options* - `['tables', 'unstructured']`
40
-
41
- *local* - `True`
42
-
43
- *debug* - `True`
44
-
45
- ### Markdown
46
-
47
- ```
48
- from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
49
-
50
- processor = MarkdownProcessor()
51
-
52
- content, table_content = processor.extract_data(
53
- file_path, # file to process
54
- options, # table extraction into HTML format
55
- local, # True if running from CLI, or False if running from FastAPI
56
- debug) # Debug
57
- ```
58
-
59
- Example:
60
-
61
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
62
-
63
- *options* - `['tables', 'markdown']`
64
-
65
- *local* - `True`
66
-
67
- *debug* - `True`
68
-
69
13
  ## Parsing and extraction
70
14
 
71
- ```
72
- from sparrow_parse.extractor.html_extractor import HTMLExtractor
73
-
74
- extractor = HTMLExtractor()
75
-
76
- answer, targets_unprocessed = extractor.read_data(
77
- target_columns, # list of table columns data to fetch
78
- data, # list of HTML tables
79
- column_keywords, # list of valid column names, can be empty. Useful to filter junk content
80
- group_by_rows, # JSON result grouping
81
- update_targets, # Set to true, if page contains multiple tables with the same columns
82
- local, # True if running from CLI, or False if running from FastAPI
83
- debug) # Debug
15
+ ### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra
84
16
 
85
17
  ```
18
+ from sparrow_parse.vllm.inference_factory import InferenceFactory
19
+ from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
86
20
 
87
- Example:
88
-
89
- *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
90
-
91
- *data* - `list of HTML tables`
21
+ extractor = VLLMExtractor()
92
22
 
93
- *column_keywords* - `None`
23
+ # export HF_TOKEN="hf_"
24
+ config = {
25
+ "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
26
+ "hf_space": "katanaml/sparrow-qwen2-vl-7b",
27
+ "hf_token": os.getenv('HF_TOKEN'),
28
+ # Additional fields for local GPU inference
29
+ # "device": "cuda", "model_path": "model.pth"
30
+ }
94
31
 
95
- *group_by_rows* - `True`
32
+ # Use the factory to get the correct instance
33
+ factory = InferenceFactory(config)
34
+ model_inference_instance = factory.get_inference_instance()
96
35
 
97
- *update_targets* - `True`
36
+ input_data = [
37
+ {
38
+ "image": "/data/bonds_table.png",
39
+ "text_input": "retrieve all data. return response in JSON format"
40
+ }
41
+ ]
98
42
 
99
- *local* - `True`
100
-
101
- *debug* - `True`
43
+ # Now you can run inference without knowing which implementation is used
44
+ result = extractor.run_inference(model_inference_instance, input_data, generic_query=False, debug=True)
45
+ print("Inference Result:", result)
46
+ ```
102
47
 
103
- ## PDF optimization
48
+ ## PDF pre-processing
104
49
 
105
50
  ```
106
51
  from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
@@ -115,7 +60,7 @@ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
115
60
 
116
61
  Example:
117
62
 
118
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
63
+ *file_path* - `/data/invoice_1.pdf`
119
64
 
120
65
  *output_directory* - set to not `None`, for debug purposes only
121
66
 
@@ -8,10 +8,10 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
8
8
 
9
9
  setup(
10
10
  name="sparrow-parse",
11
- version="0.3.3",
11
+ version="0.3.5",
12
12
  author="Andrej Baranovskij",
13
13
  author_email="andrejus.baranovskis@gmail.com",
14
- description="Sparrow Parse is a Python package for parsing and extracting information from documents.",
14
+ description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
15
15
  long_description=long_description,
16
16
  long_description_content_type="text/markdown",
17
17
  url="https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse",
@@ -30,7 +30,7 @@ setup(
30
30
  'sparrow-parse=sparrow_parse:main',
31
31
  ],
32
32
  },
33
- keywords="llm, rag, vision",
33
+ keywords="llm, vllm, ocr, vision",
34
34
  packages=find_packages(),
35
35
  python_requires='>=3.10',
36
36
  install_requires=requirements,
@@ -0,0 +1 @@
1
+ __version__ = '0.3.5'
@@ -1,14 +1,14 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.3.3
4
- Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
3
+ Version: 0.3.5
4
+ Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
7
7
  Author-email: andrejus.baranovskis@gmail.com
8
8
  License: UNKNOWN
9
9
  Project-URL: Homepage, https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
10
10
  Project-URL: Repository, https://github.com/katanaml/sparrow
11
- Keywords: llm,rag,vision
11
+ Keywords: llm,vllm,ocr,vision
12
12
  Platform: UNKNOWN
13
13
  Classifier: Operating System :: OS Independent
14
14
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
@@ -21,7 +21,7 @@ Description-Content-Type: text/markdown
21
21
 
22
22
  ## Description
23
23
 
24
- This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
24
+ This module implements the Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information. This library relies on Visual LLM functionality and Table Transformers, and is part of Sparrow. Check the main [README](https://github.com/katanaml/sparrow).
25
25
 
26
26
  ## Install
27
27
 
@@ -29,97 +29,42 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
29
29
  pip install sparrow-parse
30
30
  ```
31
31
 
32
- ## Pre-processing
33
-
34
- ### Unstructured
35
-
36
- ```
37
- from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
38
-
39
- processor = UnstructuredProcessor()
40
-
41
- content, table_content = processor.extract_data(
42
- file_path, # file to process
43
- strategy, # data processing strategy supported by unstructured
44
- model_name, # model supported by unstructured
45
- options, # table extraction into HTML format
46
- local, # True if running from CLI, or False if running from FastAPI
47
- debug) # Debug
48
- ```
49
-
50
- Example:
51
-
52
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
53
-
54
- *strategy* - `hi_res`
55
-
56
- *model_name* - `yolox`
57
-
58
- *options* - `['tables', 'unstructured']`
59
-
60
- *local* - `True`
61
-
62
- *debug* - `True`
63
-
64
- ### Markdown
65
-
66
- ```
67
- from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
68
-
69
- processor = MarkdownProcessor()
70
-
71
- content, table_content = processor.extract_data(
72
- file_path, # file to process
73
- options, # table extraction into HTML format
74
- local, # True if running from CLI, or False if running from FastAPI
75
- debug) # Debug
76
- ```
77
-
78
- Example:
79
-
80
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
81
-
82
- *options* - `['tables', 'markdown']`
83
-
84
- *local* - `True`
85
-
86
- *debug* - `True`
87
-
88
32
  ## Parsing and extraction
89
33
 
90
- ```
91
- from sparrow_parse.extractor.html_extractor import HTMLExtractor
92
-
93
- extractor = HTMLExtractor()
94
-
95
- answer, targets_unprocessed = extractor.read_data(
96
- target_columns, # list of table columns data to fetch
97
- data, # list of HTML tables
98
- column_keywords, # list of valid column names, can be empty. Useful to filter junk content
99
- group_by_rows, # JSON result grouping
100
- update_targets, # Set to true, if page contains multiple tables with the same columns
101
- local, # True if running from CLI, or False if running from FastAPI
102
- debug) # Debug
34
+ ### Sparrow Parse VL (vision-language model) extractor with Hugging Face GPU infra
103
35
 
104
36
  ```
37
+ from sparrow_parse.vllm.inference_factory import InferenceFactory
38
+ from sparrow_parse.extractors.vllm_extractor import VLLMExtractor
105
39
 
106
- Example:
107
-
108
- *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
109
-
110
- *data* - `list of HTML tables`
40
+ extractor = VLLMExtractor()
111
41
 
112
- *column_keywords* - `None`
42
+ # export HF_TOKEN="hf_"
43
+ config = {
44
+ "method": "huggingface", # Could be 'huggingface' or 'local_gpu'
45
+ "hf_space": "katanaml/sparrow-qwen2-vl-7b",
46
+ "hf_token": os.getenv('HF_TOKEN'),
47
+ # Additional fields for local GPU inference
48
+ # "device": "cuda", "model_path": "model.pth"
49
+ }
113
50
 
114
- *group_by_rows* - `True`
51
+ # Use the factory to get the correct instance
52
+ factory = InferenceFactory(config)
53
+ model_inference_instance = factory.get_inference_instance()
115
54
 
116
- *update_targets* - `True`
55
+ input_data = [
56
+ {
57
+ "image": "/data/bonds_table.png",
58
+ "text_input": "retrieve all data. return response in JSON format"
59
+ }
60
+ ]
117
61
 
118
- *local* - `True`
119
-
120
- *debug* - `True`
62
+ # Now you can run inference without knowing which implementation is used
63
+ result = extractor.run_inference(model_inference_instance, input_data, generic_query=False, debug=True)
64
+ print("Inference Result:", result)
65
+ ```
121
66
 
122
- ## PDF optimization
67
+ ## PDF pre-processing
123
68
 
124
69
  ```
125
70
  from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
@@ -134,7 +79,7 @@ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
134
79
 
135
80
  Example:
136
81
 
137
- *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
82
+ *file_path* - `/data/invoice_1.pdf`
138
83
 
139
84
  *output_directory* - set to not `None`, for debug purposes only
140
85
 
@@ -2,7 +2,6 @@ README.md
2
2
  setup.py
3
3
  sparrow_parse/__init__.py
4
4
  sparrow_parse/__main__.py
5
- sparrow_parse/temp.py
6
5
  sparrow_parse.egg-info/PKG-INFO
7
6
  sparrow_parse.egg-info/SOURCES.txt
8
7
  sparrow_parse.egg-info/dependency_links.txt
@@ -10,15 +9,11 @@ sparrow_parse.egg-info/entry_points.txt
10
9
  sparrow_parse.egg-info/requires.txt
11
10
  sparrow_parse.egg-info/top_level.txt
12
11
  sparrow_parse/extractors/__init__.py
13
- sparrow_parse/extractors/html_extractor.py
14
12
  sparrow_parse/extractors/vllm_extractor.py
15
13
  sparrow_parse/helpers/__init__.py
16
- sparrow_parse/helpers/html_extractor_helper.py
17
14
  sparrow_parse/helpers/pdf_optimizer.py
18
15
  sparrow_parse/processors/__init__.py
19
- sparrow_parse/processors/markdown_processor.py
20
16
  sparrow_parse/processors/table_structure_processor.py
21
- sparrow_parse/processors/unstructured_processor.py
22
17
  sparrow_parse/vllm/__init__.py
23
18
  sparrow_parse/vllm/huggingface_inference.py
24
19
  sparrow_parse/vllm/inference_base.py
@@ -1,11 +1,8 @@
1
- torch==2.2.2
2
- unstructured[all-docs]==0.14.5
3
- unstructured-inference==0.7.33
4
1
  rich
5
- pymupdf4llm==0.0.9
6
2
  transformers==4.41.2
7
3
  sentence-transformers==3.0.1
8
4
  numpy==1.26.4
9
5
  pypdf==4.3.0
10
6
  easyocr==1.7.1
11
7
  gradio_client
8
+ pdf2image
@@ -1 +0,0 @@
1
- __version__ = '0.3.3'