sparrow-parse 0.1.10__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.1.10
3
+ Version: 0.2.1
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  License: GPL-3.0
@@ -29,7 +29,7 @@ Description-Content-Type: text/markdown
29
29
 
30
30
  ## Description
31
31
 
32
- This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing.
32
+ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
33
33
 
34
34
  ## Install
35
35
 
@@ -37,21 +37,96 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
37
37
  pip install sparrow-parse
38
38
  ```
39
39
 
40
- ## Use
40
+ ## Pre-processing
41
41
 
42
- Import
42
+ ### Unstructured
43
43
 
44
44
  ```
45
- from sparrow_parse.extractor.file_processor import FileProcessor
45
+ from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
46
+
47
+ processor = UnstructuredProcessor()
48
+
49
+ content, table_content = processor.extract_data(
50
+ file_path, # file to process
51
+ strategy, # data processing strategy supported by unstructured
52
+ model_name, # model supported by unstructured
53
+ options, # table extraction into HTML format
54
+ local, # True if running from CLI, or False if running from FastAPI
55
+ debug) # Debug
46
56
  ```
47
57
 
48
- Usage
58
+ Example:
59
+
60
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
61
+
62
+ *strategy* - `hi_res`
63
+
64
+ *model_name* - `yolox`
65
+
66
+ *options* - `['tables', 'html']`
67
+
68
+ *local* - `True`
69
+
70
+ *debug* - `True`
71
+
72
+ ### Markdown
49
73
 
50
74
  ```
51
- processor = FileProcessor()
52
- content = processor.extract_data(file_path, strategy, model_name, options, local, debug)
75
+ from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
76
+
77
+ processor = MarkdownProcessor()
78
+
79
+ content, table_content = processor.extract_data(
80
+ file_path, # file to process
81
+ options, # table extraction into Markdown format
82
+ local, # True if running from CLI, or False if running from FastAPI
83
+ debug) # Debug
53
84
  ```
54
85
 
86
+ Example:
87
+
88
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
89
+
90
+ *options* - `['tables', 'markdown']`
91
+
92
+ *local* - `True`
93
+
94
+ *debug* - `True`
95
+
96
+ ## Parsing and extraction
97
+
98
+ ```
99
+ from sparrow_parse.extractor.html_extractor import HTMLExtractor
100
+
101
+ extractor = HTMLExtractor()
102
+
103
+ answer, targets_unprocessed = extractor.read_data(
104
+ target_columns, # list of table columns data to fetch
105
+ data, # list of HTML tables
106
+ column_keywords, # list of valid column names, can be empty. Useful to filter junk content
107
+ group_by_rows, # JSON result grouping
108
+ update_targets, # Set to True if the page contains multiple tables with the same columns
109
+ local, # True if running from CLI, or False if running from FastAPI
110
+ debug) # Debug
111
+
112
+ ```
113
+
114
+ Example:
115
+
116
+ *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
117
+
118
+ *data* - `list of HTML tables`
119
+
120
+ *column_keywords* - `None`
121
+
122
+ *group_by_rows* - `True`
123
+
124
+ *update_targets* - `True`
125
+
126
+ *local* - `True`
127
+
128
+ *debug* - `True`
129
+
55
130
  ## Library build
56
131
 
57
132
  ```
@@ -2,7 +2,7 @@
2
2
 
3
3
  ## Description
4
4
 
5
- This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing.
5
+ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-parse/) with helpful methods for data pre-processing, parsing and extracting information.
6
6
 
7
7
  ## Install
8
8
 
@@ -10,21 +10,96 @@ This module implements Sparrow Parse [library](https://pypi.org/project/sparrow-
10
10
  pip install sparrow-parse
11
11
  ```
12
12
 
13
- ## Use
13
+ ## Pre-processing
14
14
 
15
- Import
15
+ ### Unstructured
16
16
 
17
17
  ```
18
- from sparrow_parse.extractor.file_processor import FileProcessor
18
+ from sparrow_parse.extractor.unstructured_processor import UnstructuredProcessor
19
+
20
+ processor = UnstructuredProcessor()
21
+
22
+ content, table_content = processor.extract_data(
23
+ file_path, # file to process
24
+ strategy, # data processing strategy supported by unstructured
25
+ model_name, # model supported by unstructured
26
+ options, # table extraction into HTML format
27
+ local, # True if running from CLI, or False if running from FastAPI
28
+ debug) # Debug
19
29
  ```
20
30
 
21
- Usage
31
+ Example:
32
+
33
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
34
+
35
+ *strategy* - `hi_res`
36
+
37
+ *model_name* - `yolox`
38
+
39
+ *options* - `['tables', 'html']`
40
+
41
+ *local* - `True`
42
+
43
+ *debug* - `True`
44
+
45
+ ### Markdown
22
46
 
23
47
  ```
24
- processor = FileProcessor()
25
- content = processor.extract_data(file_path, strategy, model_name, options, local, debug)
48
+ from sparrow_parse.extractor.markdown_processor import MarkdownProcessor
49
+
50
+ processor = MarkdownProcessor()
51
+
52
+ content, table_content = processor.extract_data(
53
+ file_path, # file to process
54
+ options, # table extraction into Markdown format
55
+ local, # True if running from CLI, or False if running from FastAPI
56
+ debug) # Debug
26
57
  ```
27
58
 
59
+ Example:
60
+
61
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
62
+
63
+ *options* - `['tables', 'markdown']`
64
+
65
+ *local* - `True`
66
+
67
+ *debug* - `True`
68
+
69
+ ## Parsing and extraction
70
+
71
+ ```
72
+ from sparrow_parse.extractor.html_extractor import HTMLExtractor
73
+
74
+ extractor = HTMLExtractor()
75
+
76
+ answer, targets_unprocessed = extractor.read_data(
77
+ target_columns, # list of table columns data to fetch
78
+ data, # list of HTML tables
79
+ column_keywords, # list of valid column names, can be empty. Useful to filter junk content
80
+ group_by_rows, # JSON result grouping
81
+ update_targets, # Set to True if the page contains multiple tables with the same columns
82
+ local, # True if running from CLI, or False if running from FastAPI
83
+ debug) # Debug
84
+
85
+ ```
86
+
87
+ Example:
88
+
89
+ *target_columns* - `['description', 'qty', 'net_price', 'net_worth', 'vat', 'gross_worth']`
90
+
91
+ *data* - `list of HTML tables`
92
+
93
+ *column_keywords* - `None`
94
+
95
+ *group_by_rows* - `True`
96
+
97
+ *update_targets* - `True`
98
+
99
+ *local* - `True`
100
+
101
+ *debug* - `True`
102
+
28
103
  ## Library build
29
104
 
30
105
  ```
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sparrow-parse"
3
- version = "0.1.10"
3
+ version = "0.2.1"
4
4
  description = "Sparrow Parse is a Python package for parsing and extracting information from documents."
5
5
  authors = ["Andrej Baranovskij <andrejus.baranovskis@gmail.com>"]
6
6
  license = "GPL-3.0"
@@ -0,0 +1 @@
1
+ __version__ = '0.2.1'
@@ -3,8 +3,8 @@ from sentence_transformers import SentenceTransformer, util
3
3
  from bs4 import BeautifulSoup
4
4
  import json
5
5
  from rich.progress import Progress, SpinnerColumn, TextColumn
6
- from extractor_helper import merge_html_table_headers
7
- from extractor_helper import clean_html_table_header_names
6
+ from .extractor_helper import merge_html_table_headers
7
+ from .extractor_helper import clean_html_table_header_names
8
8
  import re
9
9
 
10
10
 
@@ -221,7 +221,10 @@ class HTMLExtractor(object):
221
221
 
222
222
 
223
223
  if __name__ == "__main__":
224
- # with open('../data/invoice_1_table.txt', 'r') as file:
224
+ # to run for debugging, navigate to sparrow_parse and run the following command:
225
+ # python -m extractor.html_extractor
226
+
227
+ # with open('data/invoice_1_table.txt', 'r') as file:
225
228
  # file_content = file.read()
226
229
  #
227
230
  # file_content = file_content.strip()[1:-1].strip()
@@ -128,18 +128,10 @@ class MarkdownProcessor(object):
128
128
 
129
129
  if __name__ == "__main__":
130
130
  processor = MarkdownProcessor()
131
+
131
132
  # content, table_content = processor.extract_data(
132
133
  # '/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
133
134
  # ['tables', 'markdown'],
134
135
  # True,
135
136
  # True)
136
- content, table_content = processor.extract_data(
137
- '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
138
- ['tables', 'markdown'],
139
- True,
140
- True)
141
- # content, table_content = processor.extract_data(
142
- # '/Users/andrejb/Documents/work/epik/bankstatement/POSB_2_1.pdf',
143
- # ['tables', 'markdown'],
144
- # True,
145
- # True)
137
+
@@ -13,4 +13,16 @@
13
13
  # 'yolox',
14
14
  # ['tables', 'html'],
15
15
  # True,
16
- # True)
16
+ # True)
17
+
18
+
19
+ # content, table_content = processor.extract_data(
20
+ # '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
21
+ # ['tables', 'markdown'],
22
+ # True,
23
+ # True)
24
+ # content, table_content = processor.extract_data(
25
+ # '/Users/andrejb/Documents/work/epik/bankstatement/POSB_2_1.pdf',
26
+ # ['tables', 'markdown'],
27
+ # True,
28
+ # True)
@@ -1 +0,0 @@
1
- __version__ = '0.1.10'