GdoczAI 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gdoczai-0.1.0/GdoczAI.egg-info/PKG-INFO +52 -0
- gdoczai-0.1.0/GdoczAI.egg-info/SOURCES.txt +16 -0
- gdoczai-0.1.0/GdoczAI.egg-info/dependency_links.txt +1 -0
- gdoczai-0.1.0/GdoczAI.egg-info/requires.txt +1 -0
- gdoczai-0.1.0/GdoczAI.egg-info/top_level.txt +1 -0
- gdoczai-0.1.0/PKG-INFO +52 -0
- gdoczai-0.1.0/README.md +38 -0
- gdoczai-0.1.0/gdocz_sdk/__init__.py +13 -0
- gdoczai-0.1.0/gdocz_sdk/client.py +147 -0
- gdoczai-0.1.0/gdocz_sdk/convert_options.py +5 -0
- gdoczai-0.1.0/gdocz_sdk/convert_result.py +22 -0
- gdoczai-0.1.0/gdocz_sdk/exceptions.py +14 -0
- gdoczai-0.1.0/gdocz_sdk/extract_options.py +9 -0
- gdoczai-0.1.0/gdocz_sdk/extract_result.py +13 -0
- gdoczai-0.1.0/gdocz_sdk/segment_options.py +9 -0
- gdoczai-0.1.0/gdocz_sdk/segment_result.py +17 -0
- gdoczai-0.1.0/pyproject.toml +26 -0
- gdoczai-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: GdoczAI
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for GdoczAI — OCR, extract, and segment documents
|
|
5
|
+
Author-email: Ramkumar <ramkumarlpm4@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/GramosoftAI/GdoczAI
|
|
7
|
+
Keywords: ocr,document,pdf,extract,gdocz,ai
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: requests>=2.28.0
|
|
14
|
+
|
|
15
|
+
# gdoczai
|
|
16
|
+
|
|
17
|
+
Official Python SDK for [GdoczAI](https://gdocz.gramopro.ai) — OCR, extract fields,
|
|
18
|
+
and segment documents with ease.
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
```bash
|
|
22
|
+
pip install GdoczAI
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
```python
|
|
27
|
+
from gdocz_sdk import GdoczaiClient
|
|
28
|
+
|
|
29
|
+
client = GdoczaiClient(api_key="your-api-key")
|
|
30
|
+
|
|
31
|
+
# Convert PDF to Markdown
|
|
32
|
+
result = client.convert("document.pdf")
|
|
33
|
+
print(result.markdown)
|
|
34
|
+
|
|
35
|
+
# Extract fields
|
|
36
|
+
from gdocz_sdk import ExtractOptions
|
|
37
|
+
result = client.extract("document.pdf", ExtractOptions(fields=["name", "date"]))
|
|
38
|
+
print(result.extracted_data)
|
|
39
|
+
|
|
40
|
+
# Segment document
|
|
41
|
+
result = client.segment("document.pdf")
|
|
42
|
+
print(result.segments)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## API Key
|
|
46
|
+
|
|
47
|
+
Set your API key as an environment variable:
|
|
48
|
+
```bash
|
|
49
|
+
export GDOCZAI_API_KEY="your-api-key"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Or pass it directly to the client.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
GdoczAI.egg-info/PKG-INFO
|
|
4
|
+
GdoczAI.egg-info/SOURCES.txt
|
|
5
|
+
GdoczAI.egg-info/dependency_links.txt
|
|
6
|
+
GdoczAI.egg-info/requires.txt
|
|
7
|
+
GdoczAI.egg-info/top_level.txt
|
|
8
|
+
gdocz_sdk/__init__.py
|
|
9
|
+
gdocz_sdk/client.py
|
|
10
|
+
gdocz_sdk/convert_options.py
|
|
11
|
+
gdocz_sdk/convert_result.py
|
|
12
|
+
gdocz_sdk/exceptions.py
|
|
13
|
+
gdocz_sdk/extract_options.py
|
|
14
|
+
gdocz_sdk/extract_result.py
|
|
15
|
+
gdocz_sdk/segment_options.py
|
|
16
|
+
gdocz_sdk/segment_result.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
requests>=2.28.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
gdocz_sdk
|
gdoczai-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: GdoczAI
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for GdoczAI — OCR, extract, and segment documents
|
|
5
|
+
Author-email: Ramkumar <ramkumarlpm4@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/GramosoftAI/GdoczAI
|
|
7
|
+
Keywords: ocr,document,pdf,extract,gdocz,ai
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: requests>=2.28.0
|
|
14
|
+
|
|
15
|
+
# gdoczai
|
|
16
|
+
|
|
17
|
+
Official Python SDK for [GdoczAI](https://gdocz.gramopro.ai) — OCR, extract fields,
|
|
18
|
+
and segment documents with ease.
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
```bash
|
|
22
|
+
pip install GdoczAI
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
```python
|
|
27
|
+
from gdocz_sdk import GdoczaiClient
|
|
28
|
+
|
|
29
|
+
client = GdoczaiClient(api_key="your-api-key")
|
|
30
|
+
|
|
31
|
+
# Convert PDF to Markdown
|
|
32
|
+
result = client.convert("document.pdf")
|
|
33
|
+
print(result.markdown)
|
|
34
|
+
|
|
35
|
+
# Extract fields
|
|
36
|
+
from gdocz_sdk import ExtractOptions
|
|
37
|
+
result = client.extract("document.pdf", ExtractOptions(fields=["name", "date"]))
|
|
38
|
+
print(result.extracted_data)
|
|
39
|
+
|
|
40
|
+
# Segment document
|
|
41
|
+
result = client.segment("document.pdf")
|
|
42
|
+
print(result.segments)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## API Key
|
|
46
|
+
|
|
47
|
+
Set your API key as an environment variable:
|
|
48
|
+
```bash
|
|
49
|
+
export GDOCZAI_API_KEY="your-api-key"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Or pass it directly to the client.
|
gdoczai-0.1.0/README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# gdoczai
|
|
2
|
+
|
|
3
|
+
Official Python SDK for [GdoczAI](https://gdocz.gramopro.ai) — OCR, extract fields,
|
|
4
|
+
and segment documents with ease.
|
|
5
|
+
|
|
6
|
+
## Installation
|
|
7
|
+
```bash
|
|
8
|
+
pip install GdoczAI
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
```python
|
|
13
|
+
from gdocz_sdk import GdoczaiClient
|
|
14
|
+
|
|
15
|
+
client = GdoczaiClient(api_key="your-api-key")
|
|
16
|
+
|
|
17
|
+
# Convert PDF to Markdown
|
|
18
|
+
result = client.convert("document.pdf")
|
|
19
|
+
print(result.markdown)
|
|
20
|
+
|
|
21
|
+
# Extract fields
|
|
22
|
+
from gdocz_sdk import ExtractOptions
|
|
23
|
+
result = client.extract("document.pdf", ExtractOptions(fields=["name", "date"]))
|
|
24
|
+
print(result.extracted_data)
|
|
25
|
+
|
|
26
|
+
# Segment document
|
|
27
|
+
result = client.segment("document.pdf")
|
|
28
|
+
print(result.segments)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## API Key
|
|
32
|
+
|
|
33
|
+
Set your API key as an environment variable:
|
|
34
|
+
```bash
|
|
35
|
+
export GDOCZAI_API_KEY="your-api-key"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Or pass it directly to the client.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .client import GdoczaiClient
|
|
2
|
+
from .convert_options import ConvertOptions
|
|
3
|
+
from .extract_options import ExtractOptions
|
|
4
|
+
from .segment_options import SegmentOptions
|
|
5
|
+
from .convert_result import ConvertResult
|
|
6
|
+
from .extract_result import ExtractResult
|
|
7
|
+
from .segment_result import SegmentResult
|
|
8
|
+
from .exceptions import (
|
|
9
|
+
GdoczAPIError,
|
|
10
|
+
GdoczTimeoutError,
|
|
11
|
+
GdoczFileError,
|
|
12
|
+
GdoczValidationError,
|
|
13
|
+
)
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
import os
|
|
3
|
+
import json
|
|
4
|
+
from .convert_result import ConvertResult
|
|
5
|
+
from .extract_result import ExtractResult
|
|
6
|
+
from .segment_result import SegmentResult
|
|
7
|
+
from .exceptions import GdoczAPIError, GdoczTimeoutError, GdoczFileError, GdoczValidationError
|
|
8
|
+
|
|
9
|
+
class GdoczaiClient:
    """HTTP client for the GdoczAI convert, extract, and segment endpoints.

    Every request sends the API key in the ``X-API-Key`` header and uses the
    client-wide ``timeout`` value.
    """

    # Base URL used for any endpoint whose base URL is not supplied.
    _DEFAULT_BASE_URL = "https://gdocz.gramopro.ai/ocr"

    def __init__(self, api_key=None, convert_base_url=None, extract_base_url=None, segment_base_url=None, timeout=300):
        """Create a client.

        Args:
            api_key: API key; when falsy, the ``GDOCZAI_API_KEY`` environment
                variable is used instead.
            convert_base_url: Base URL for ``convert``; defaults to the
                built-in URL when falsy.
            extract_base_url: Base URL for ``extract``; same default.
            segment_base_url: Base URL for ``segment``; same default.
            timeout: Value passed as ``timeout`` to every ``requests.post``.

        Raises:
            GdoczValidationError: If no API key is given and the environment
                variable is unset or empty.
        """
        self.api_key = api_key or os.getenv("GDOCZAI_API_KEY")
        if not self.api_key:
            raise GdoczValidationError("API key required")

        self.convert_base_url = self._normalize_base_url(convert_base_url)
        self.extract_base_url = self._normalize_base_url(extract_base_url)
        self.segment_base_url = self._normalize_base_url(segment_base_url)
        self.timeout = timeout

    @classmethod
    def _normalize_base_url(cls, base_url):
        """Return ``base_url`` (or the default) without trailing slashes."""
        # Endpoint paths are appended with a leading "/", so a trailing slash
        # on the base would produce "//" in the final URL.
        return (base_url or cls._DEFAULT_BASE_URL).rstrip("/")

    @staticmethod
    def _require_file(file_path):
        """Raise GdoczFileError unless ``file_path`` is an existing file."""
        # isfile (not exists) so that a directory is rejected here instead of
        # failing later inside open() with a non-SDK exception.
        if not os.path.isfile(file_path):
            raise GdoczFileError(f"File not found: {file_path}")

    def _post_file(self, url, file_path, data, timeout_message):
        """POST ``file_path`` as multipart field ``file`` together with ``data``."""
        try:
            with open(file_path, "rb") as f:
                return requests.post(
                    url,
                    files={"file": (os.path.basename(file_path), f)},
                    data=data,
                    headers={"X-API-Key": self.api_key},
                    timeout=self.timeout,
                )
        except requests.exceptions.Timeout as err:
            raise GdoczTimeoutError(timeout_message) from err

    @staticmethod
    def _json_or_raise(response):
        """Return the decoded JSON body, or raise GdoczAPIError if status != 200."""
        if response.status_code != 200:
            raise GdoczAPIError(response.status_code, response.text)
        return response.json()

    def convert(self, file_path, options=None):
        """Upload a file to ``/ocr/markdown-only`` and return the result.

        Args:
            file_path: Path of the file to upload.
            options: Optional object exposing ``mode`` and ``page_range``.
                Without it, mode ``"chandra"`` is sent and no page range.

        Returns:
            ConvertResult built from the JSON response.

        Raises:
            GdoczFileError: If ``file_path`` is not an existing file.
            GdoczTimeoutError: If the request times out.
            GdoczAPIError: If the response status is not 200.
        """
        self._require_file(file_path)

        mode = "chandra"
        page_range = None
        if options:
            mode = options.mode
            page_range = options.page_range

        # The selected mode is sent under the form key "model".
        data = {"model": mode}
        if page_range:
            data["page_range"] = page_range

        response = self._post_file(
            f"{self.convert_base_url}/ocr/markdown-only",
            file_path,
            data,
            "Convert request timed out",
        )
        return ConvertResult(self._json_or_raise(response))

    def extract(self, file_path, options=None):
        """Extract fields via ``/ocr/extract/markdown`` and return the result.

        When ``options.request_id`` is truthy, the request id and fields are
        sent directly. Otherwise the file is first run through ``convert``
        and the resulting markdown is sent along with the fields.

        Args:
            file_path: Path of the source file; it must exist in both cases.
            options: Optional object exposing ``fields`` and ``request_id``.

        Returns:
            ExtractResult built from the JSON response.

        Raises:
            GdoczFileError: If ``file_path`` is not an existing file.
            GdoczValidationError: If ``fields`` is neither ``None``, a list
                nor a dict, or if ``convert`` returned no markdown.
            GdoczTimeoutError: If a request times out.
            GdoczAPIError: If a response status is not 200.
        """
        self._require_file(file_path)

        if options and options.fields is not None and not isinstance(options.fields, (list, dict)):
            raise GdoczValidationError("fields must be a list or schema dict")

        headers = {
            "X-API-Key": self.api_key,
            "Content-Type": "application/json",
        }

        if options and options.request_id:
            payload = {
                "request_id": options.request_id,
                "fields": options.fields,
            }
        else:
            # convert() translates its own timeouts, so it does not need to
            # sit inside the try block below.
            markdown = self.convert(file_path).markdown
            if not markdown:
                raise GdoczValidationError("Markdown extraction failed — empty response from convert")

            payload = {
                "markdown_content": markdown,
                "fields": options.fields if options else [],
            }

        try:
            response = requests.post(
                f"{self.extract_base_url}/ocr/extract/markdown",
                json=payload,
                headers=headers,
                timeout=self.timeout,
            )
        except requests.exceptions.Timeout as err:
            raise GdoczTimeoutError("Extract request timed out") from err

        return ExtractResult(self._json_or_raise(response))

    def segment(self, file_path, options=None):
        """Upload a file to ``/ocr/segment`` and return the result.

        Args:
            file_path: Path of the file to upload.
            options: Optional object exposing ``mode`` and ``segments``.
                Without it, mode ``"auto"`` is sent.

        Returns:
            SegmentResult built from the JSON response.

        Raises:
            GdoczFileError: If ``file_path`` is not an existing file.
            GdoczValidationError: If mode is ``"guided"`` and ``segments`` is
                empty or missing.
            GdoczTimeoutError: If the request times out.
            GdoczAPIError: If the response status is not 200.
        """
        self._require_file(file_path)

        mode = "auto"
        segments = None
        if options:
            mode = options.mode
            segments = options.segments

        if mode == "guided" and not segments:
            raise GdoczValidationError("Guided mode requires segments list")

        data = {"mode": mode}
        if mode == "guided":
            # Non-empty here thanks to the validation above; sent as a JSON
            # string because it travels in a multipart form field.
            data["segments"] = json.dumps(segments)

        response = self._post_file(
            f"{self.segment_base_url}/ocr/segment",
            file_path,
            data,
            "Segment request timed out",
        )
        return SegmentResult(self._json_or_raise(response))
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
class ConvertResult:
    """Attribute-style view over the JSON dict returned by a convert call."""

    def __init__(self, data):
        """Populate attributes from ``data``.

        Args:
            data: Decoded JSON response (a dict). Missing keys yield ``None``,
                except ``success`` which defaults to ``False``.
        """
        self.success = data.get("success", False)
        self.request_id = data.get("request_id")
        self.markdown = data.get("markdown")

        # metadata fields
        # `or {}` also covers a "metadata" key that is present but null,
        # which a plain .get default would pass through as None.
        metadata = data.get("metadata") or {}
        self.metadata = metadata
        self.page_count = metadata.get("page_count")
        self.processing_time = metadata.get("processing_time_seconds")
        self.filename = metadata.get("filename")
        self.model = metadata.get("model")

    def save_output(self, folder_path):
        """Write the markdown to ``<folder_path>/output.md`` as UTF-8.

        The folder is created if needed; an empty file is written when there
        is no markdown.
        """
        os.makedirs(folder_path, exist_ok=True)

        with open(os.path.join(folder_path, "output.md"), "w", encoding="utf-8") as f:
            f.write(self.markdown or "")
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
class GdoczError(Exception):
    """Common base of the exception classes defined in this module.

    Lets callers handle any of them with a single ``except GdoczError``.
    """


class GdoczAPIError(GdoczError):
    """Error carrying an HTTP status code and the associated response data."""

    def __init__(self, status_code, response_data):
        """Store both values and build the ``API error <code>: <data>`` message."""
        self.status_code = status_code
        self.response_data = response_data
        super().__init__(f"API error {status_code}: {response_data}")


class GdoczTimeoutError(GdoczError):
    """Error type for timed-out requests."""


class GdoczFileError(GdoczError):
    """Error type for problems with an input file."""


class GdoczValidationError(GdoczError):
    """Error type for invalid arguments or options."""
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
class ExtractResult:
    """Attribute-style view over the JSON dict returned by an extract call."""

    def __init__(self, data):
        """Populate attributes from ``data``.

        Args:
            data: Decoded JSON response (a dict). ``success`` defaults to
                ``False`` and ``extracted_data`` to an empty dict when absent.
        """
        self.success = data.get("success", False)
        self.extracted_data = data.get("extracted_data", {})

        # metadata fields
        # `or {}` also covers a "metadata" key that is present but null,
        # which a plain .get default would pass through as None.
        metadata = data.get("metadata") or {}
        self.metadata = metadata
        self.fields_requested = metadata.get("fields_requested")
        self.fields_extracted = metadata.get("fields_extracted")
        self.schema_type = metadata.get("schema_type")
        self.token_usage = metadata.get("token_usage", {})
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
class SegmentResult:
    """Attribute-style view over the JSON dict returned by a segment call."""

    def __init__(self, data):
        """Populate attributes from ``data``.

        Args:
            data: Decoded JSON response (a dict). ``success`` defaults to
                ``False`` and ``segments`` to an empty list when absent.
        """
        self.success = data.get("success", False)
        self.segments = data.get("segments", [])
        self.request_id = data.get("request_id")
        self.processing_time = data.get("processing_time")

        # metadata fields
        # `or {}` also covers a "metadata" key that is present but null,
        # which a plain .get default would pass through as None.
        metadata = data.get("metadata") or {}
        self.metadata = metadata
        self.total_pages = metadata.get("total_pages")
        self.strategy = metadata.get("strategy")
        self.mode = metadata.get("mode")
        self.filename = metadata.get("filename")
        self.ocr_time = metadata.get("ocr_time_seconds")
        self.segmentation_time = metadata.get("segmentation_time_seconds")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "GdoczAI"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Python SDK for GdoczAI — OCR, extract, and segment documents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { file = "LICENSE" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Ramkumar", email = "ramkumarlpm4@gmail.com" }
|
|
13
|
+
]
|
|
14
|
+
keywords = ["ocr", "document", "pdf", "extract", "gdocz", "ai"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
]
|
|
20
|
+
requires-python = ">=3.8"
|
|
21
|
+
dependencies = [
|
|
22
|
+
"requests>=2.28.0"
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/GramosoftAI/GdoczAI"
|
gdoczai-0.1.0/setup.cfg
ADDED