sparrow-parse 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sparrow_parse/__init__.py +1 -1
- sparrow_parse/extractor/file_processor.py +143 -0
- {sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/METADATA +13 -4
- sparrow_parse-0.1.9.dist-info/RECORD +8 -0
- sparrow_parse/pdf/pdf_processor.py +0 -7
- sparrow_parse-0.1.8.dist-info/RECORD +0 -8
- /sparrow_parse/{pdf → extractor}/__init__.py +0 -0
- {sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/WHEEL +0 -0
- {sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/entry_points.txt +0 -0
sparrow_parse/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = '0.1.8'
+__version__ = '0.1.9'
sparrow_parse/extractor/file_processor.py
ADDED
@@ -0,0 +1,143 @@
+import tempfile
+import os
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.image import partition_image
+import json
+from unstructured.staging.base import elements_to_json
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+
+class FileProcessor(object):
+    def __init__(self):
+        pass
+
+    def extract_data(self, file_path, strategy, model_name, options, local=True, debug=False):
+        # check if string options contains word table
+        extract_tables = False
+        if options is not None and "tables" in options:
+            extract_tables = True
+
+        # Extracts the elements from the PDF
+        elements = self.invoke_pipeline_step(
+            lambda: self.process_file(file_path, strategy, model_name),
+            "Extracting elements from the document...",
+            local
+        )
+
+        if debug:
+            new_extension = 'json'  # You can change this to any extension you want
+            new_file_path = self.change_file_extension(file_path, new_extension)
+
+            content = self.invoke_pipeline_step(
+                lambda: self.load_text_data(elements, new_file_path, extract_tables),
+                "Loading text data...",
+                local
+            )
+        else:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_file_path = os.path.join(temp_dir, "file_data.json")
+
+                content = self.invoke_pipeline_step(
+                    lambda: self.load_text_data(elements, temp_file_path, extract_tables),
+                    "Loading text data...",
+                    local
+                )
+
+        return content
+
+    def process_file(self, file_path, strategy, model_name):
+        elements = None
+
+        if file_path.lower().endswith('.pdf'):
+            elements = partition_pdf(
+                filename=file_path,
+                strategy=strategy,
+                infer_table_structure=True,
+                model_name=model_name
+            )
+        elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
+            elements = partition_image(
+                filename=file_path,
+                strategy=strategy,
+                infer_table_structure=True,
+                model_name=model_name
+            )
+
+        return elements
+
+    def change_file_extension(self, file_path, new_extension):
+        # Check if the new extension starts with a dot and add one if not
+        if not new_extension.startswith('.'):
+            new_extension = '.' + new_extension
+
+        # Split the file path into two parts: the base (everything before the last dot) and the extension
+        # If there's no dot in the filename, it'll just return the original filename without an extension
+        base = file_path.rsplit('.', 1)[0]
+
+        # Concatenate the base with the new extension
+        new_file_path = base + new_extension
+
+        return new_file_path
+
+    def load_text_data(self, elements, file_path, extract_tables):
+        elements_to_json(elements, filename=file_path)
+        text_file = self.process_json_file(file_path, extract_tables)
+
+        with open(text_file, 'r') as file:
+            content = file.read()
+
+        return content
+
+    def process_json_file(self, input_data, extract_tables):
+        # Read the JSON file
+        with open(input_data, 'r') as file:
+            data = json.load(file)
+
+        # Iterate over the JSON data and extract required table elements
+        extracted_elements = []
+        for entry in data:
+            if entry["type"] == "Table":
+                extracted_elements.append(entry["metadata"]["text_as_html"])
+            elif entry["type"] == "Title" and extract_tables is False:
+                extracted_elements.append(entry["text"])
+            elif entry["type"] == "NarrativeText" and extract_tables is False:
+                extracted_elements.append(entry["text"])
+            elif entry["type"] == "UncategorizedText" and extract_tables is False:
+                extracted_elements.append(entry["text"])
+
+        # Write the extracted elements to the output file
+        new_extension = 'txt'  # You can change this to any extension you want
+        new_file_path = self.change_file_extension(input_data, new_extension)
+        with open(new_file_path, 'w') as output_file:
+            for element in extracted_elements:
+                output_file.write(element + "\n\n")  # Adding two newlines for separation
+
+        return new_file_path
+
+    def invoke_pipeline_step(self, task_call, task_description, local):
+        if local:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                transient=False,
+            ) as progress:
+                progress.add_task(description=task_description, total=None)
+                ret = task_call()
+        else:
+            print(task_description)
+            ret = task_call()
+
+        return ret
+
+
+# if __name__ == "__main__":
+#     processor = FileProcessor()
+#     content = processor.extract_data('/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf',
+#                                      'hi_res',
+#                                      'yolox',
+#                                      'tables',
+#                                      False,
+#                                      True)
+#     processor.extract_data("/Users/andrejb/Documents/work/lifung/lemming_test/C16E150001_SUPINV.pdf")
+#     processor.extract_data("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_single.pdf")
+#     print(content)
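The commented-out block at the bottom of the new module doubles as its only usage example. Translated into a runnable form, it looks roughly like the minimal sketch below; the invoice path is a placeholder, and 'hi_res' / 'yolox' are simply the strategy and model name the package's own example passes through to unstructured.

```
from sparrow_parse.extractor.file_processor import FileProcessor

processor = FileProcessor()

# "invoice.pdf" is a placeholder path; substitute a real PDF or image.
content = processor.extract_data(
    "invoice.pdf",  # document to parse
    "hi_res",       # unstructured partitioning strategy
    "yolox",        # table-detection model name
    "tables",       # any options string containing "tables" keeps only table HTML
    local=True,     # True shows a rich spinner; False just prints the step description
    debug=False     # False writes the intermediate JSON to a temporary directory
)
print(content)
```

When the options string does not mention tables, process_json_file returns titles, narrative text, and uncategorized text instead of table HTML.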
{sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/METADATA
CHANGED
@@ -1,21 +1,24 @@
 Metadata-Version: 2.1
 Name: sparrow-parse
-Version: 0.1.8
+Version: 0.1.9
 Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
 Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
 License: GPL-3.0
 Keywords: llm,rag,vision
 Author: Andrej Baranovskij
 Author-email: andrejus.baranovskis@gmail.com
-Requires-Python: >=3.
+Requires-Python: >=3.9,<3.12
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development
-Requires-Dist:
+Requires-Dist: rich (>=13.7.1,<14.0.0)
+Requires-Dist: torch (==2.2.2)
+Requires-Dist: unstructured-inference (==0.7.29)
+Requires-Dist: unstructured[all-docs] (==0.13.6)
 Project-URL: Repository, https://github.com/katanaml/sparrow
 Description-Content-Type: text/markdown
 
@@ -49,7 +52,13 @@ result = processor.process_file(file_path, strategy, model_name)
 Build for development
 
 ```
+poetry build
+```
+
+Publish to PyPi
 
+```
+poetry publish
 ```
 
 ## Commercial usage
sparrow_parse-0.1.9.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+sparrow_parse/__init__.py,sha256=m1D6fscvvsMhq5HVNKw7kP5M8AqEzQm1ekrn_nLQF1M,21
+sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
+sparrow_parse/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sparrow_parse/extractor/file_processor.py,sha256=OrDxFJVEhy_4pCBxknehAM7fxgSlWgUJ0jeTEegHRxo,5621
+sparrow_parse-0.1.9.dist-info/METADATA,sha256=wK7uOpPqsC1iwZs_d5Hl1KV4DIbw8NnfPU6MBTyF_kA,3428
+sparrow_parse-0.1.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+sparrow_parse-0.1.9.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
+sparrow_parse-0.1.9.dist-info/RECORD,,
sparrow_parse-0.1.8.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-sparrow_parse/__init__.py,sha256=zemvJ5zjFE6SQT2xmkxc-ZYwNkUTCEX7mz3Epb2qztE,21
-sparrow_parse/__main__.py,sha256=Xs1bpJV0n08KWOoQE34FBYn6EBXZA9HIYJKrE4ZdG78,153
-sparrow_parse/pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sparrow_parse/pdf/pdf_processor.py,sha256=hyvOQX_IydRA3z7gQs_g-Ut1hvVHRRxj1_2i-G09-ow,159
-sparrow_parse-0.1.8.dist-info/METADATA,sha256=QTAeFIi-KwyBvSbBrB8wS5WCld3gQ3XfAll4wS4x7Yc,3250
-sparrow_parse-0.1.8.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-sparrow_parse-0.1.8.dist-info/entry_points.txt,sha256=H507qotwq3VX4lv5pY9MZYtupKNE1RRb8gEQucPiGi0,52
-sparrow_parse-0.1.8.dist-info/RECORD,,

/sparrow_parse/{pdf → extractor}/__init__.py
File without changes

{sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/WHEEL
File without changes

{sparrow_parse-0.1.8.dist-info → sparrow_parse-0.1.9.dist-info}/entry_points.txt
File without changes