GdoczAI 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ Metadata-Version: 2.4
2
+ Name: GdoczAI
3
+ Version: 0.1.0
4
+ Summary: Python SDK for GdoczAI — OCR, extract, and segment documents
5
+ Author-email: Ramkumar <ramkumarlpm4@gmail.com>
6
+ Project-URL: Homepage, https://github.com/GramosoftAI/GdoczAI
7
+ Keywords: ocr,document,pdf,extract,gdocz,ai
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.8
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: requests>=2.28.0
14
+
15
+ # gdoczai
16
+
17
+ Official Python SDK for [GdoczAI](https://gdocz.gramopro.ai) — OCR, extract fields,
18
+ and segment documents with ease.
19
+
20
+ ## Installation
21
+ ```bash
22
+ pip install GdoczAI
23
+ ```
24
+
25
+ ## Quick Start
26
+ ```python
27
+ from gdocz_sdk import GdoczaiClient
28
+
29
+ client = GdoczaiClient(api_key="your-api-key")
30
+
31
+ # Convert PDF to Markdown
32
+ result = client.convert("document.pdf")
33
+ print(result.markdown)
34
+
35
+ # Extract fields
36
+ from gdocz_sdk import ExtractOptions
37
+ result = client.extract("document.pdf", ExtractOptions(fields=["name", "date"]))
38
+ print(result.extracted_data)
39
+
40
+ # Segment document
41
+ result = client.segment("document.pdf")
42
+ print(result.segments)
43
+ ```
44
+
45
+ ## API Key
46
+
47
+ Set your API key as an environment variable:
48
+ ```bash
49
+ export GDOCZAI_API_KEY="your-api-key"
50
+ ```
51
+
52
+ Or pass it directly to the client.
@@ -0,0 +1,16 @@
1
+ README.md
2
+ pyproject.toml
3
+ GdoczAI.egg-info/PKG-INFO
4
+ GdoczAI.egg-info/SOURCES.txt
5
+ GdoczAI.egg-info/dependency_links.txt
6
+ GdoczAI.egg-info/requires.txt
7
+ GdoczAI.egg-info/top_level.txt
8
+ gdocz_sdk/__init__.py
9
+ gdocz_sdk/client.py
10
+ gdocz_sdk/convert_options.py
11
+ gdocz_sdk/convert_result.py
12
+ gdocz_sdk/exceptions.py
13
+ gdocz_sdk/extract_options.py
14
+ gdocz_sdk/extract_result.py
15
+ gdocz_sdk/segment_options.py
16
+ gdocz_sdk/segment_result.py
@@ -0,0 +1 @@
1
+ requests>=2.28.0
@@ -0,0 +1 @@
1
+ gdocz_sdk
gdoczai-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,52 @@
1
+ Metadata-Version: 2.4
2
+ Name: GdoczAI
3
+ Version: 0.1.0
4
+ Summary: Python SDK for GdoczAI — OCR, extract, and segment documents
5
+ Author-email: Ramkumar <ramkumarlpm4@gmail.com>
6
+ Project-URL: Homepage, https://github.com/GramosoftAI/GdoczAI
7
+ Keywords: ocr,document,pdf,extract,gdocz,ai
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.8
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: requests>=2.28.0
14
+
15
+ # gdoczai
16
+
17
+ Official Python SDK for [GdoczAI](https://gdocz.gramopro.ai) — OCR, extract fields,
18
+ and segment documents with ease.
19
+
20
+ ## Installation
21
+ ```bash
22
+ pip install GdoczAI
23
+ ```
24
+
25
+ ## Quick Start
26
+ ```python
27
+ from gdocz_sdk import GdoczaiClient
28
+
29
+ client = GdoczaiClient(api_key="your-api-key")
30
+
31
+ # Convert PDF to Markdown
32
+ result = client.convert("document.pdf")
33
+ print(result.markdown)
34
+
35
+ # Extract fields
36
+ from gdocz_sdk import ExtractOptions
37
+ result = client.extract("document.pdf", ExtractOptions(fields=["name", "date"]))
38
+ print(result.extracted_data)
39
+
40
+ # Segment document
41
+ result = client.segment("document.pdf")
42
+ print(result.segments)
43
+ ```
44
+
45
+ ## API Key
46
+
47
+ Set your API key as an environment variable:
48
+ ```bash
49
+ export GDOCZAI_API_KEY="your-api-key"
50
+ ```
51
+
52
+ Or pass it directly to the client.
@@ -0,0 +1,38 @@
1
+ # gdoczai
2
+
3
+ Official Python SDK for [GdoczAI](https://gdocz.gramopro.ai) — OCR, extract fields,
4
+ and segment documents with ease.
5
+
6
+ ## Installation
7
+ ```bash
8
+ pip install GdoczAI
9
+ ```
10
+
11
+ ## Quick Start
12
+ ```python
13
+ from gdocz_sdk import GdoczaiClient
14
+
15
+ client = GdoczaiClient(api_key="your-api-key")
16
+
17
+ # Convert PDF to Markdown
18
+ result = client.convert("document.pdf")
19
+ print(result.markdown)
20
+
21
+ # Extract fields
22
+ from gdocz_sdk import ExtractOptions
23
+ result = client.extract("document.pdf", ExtractOptions(fields=["name", "date"]))
24
+ print(result.extracted_data)
25
+
26
+ # Segment document
27
+ result = client.segment("document.pdf")
28
+ print(result.segments)
29
+ ```
30
+
31
+ ## API Key
32
+
33
+ Set your API key as an environment variable:
34
+ ```bash
35
+ export GDOCZAI_API_KEY="your-api-key"
36
+ ```
37
+
38
+ Or pass it directly to the client.
@@ -0,0 +1,13 @@
1
+ from .client import GdoczaiClient
2
+ from .convert_options import ConvertOptions
3
+ from .extract_options import ExtractOptions
4
+ from .segment_options import SegmentOptions
5
+ from .convert_result import ConvertResult
6
+ from .extract_result import ExtractResult
7
+ from .segment_result import SegmentResult
8
+ from .exceptions import (
9
+ GdoczAPIError,
10
+ GdoczTimeoutError,
11
+ GdoczFileError,
12
+ GdoczValidationError,
13
+ )
@@ -0,0 +1,147 @@
1
+ import requests
2
+ import os
3
+ import json
4
+ from .convert_result import ConvertResult
5
+ from .extract_result import ExtractResult
6
+ from .segment_result import SegmentResult
7
+ from .exceptions import GdoczAPIError, GdoczTimeoutError, GdoczFileError, GdoczValidationError
8
+
9
class GdoczaiClient:
    """Client for the GdoczAI convert, extract, and segment HTTP endpoints.

    Every request sends the API key in the ``X-API-Key`` header and uses the
    timeout given to the constructor.
    """

    _DEFAULT_BASE_URL = "https://gdocz.gramopro.ai/ocr"

    def __init__(self, api_key=None, convert_base_url=None, extract_base_url=None, segment_base_url=None, timeout=300):
        """Store credentials and endpoint settings.

        Args:
            api_key: API key; when falsy, the ``GDOCZAI_API_KEY`` environment
                variable is used instead.
            convert_base_url: Base URL used by :meth:`convert`.
            extract_base_url: Base URL used by :meth:`extract`.
            segment_base_url: Base URL used by :meth:`segment`.
            timeout: Value passed as ``timeout`` to every ``requests.post``.

        Raises:
            GdoczValidationError: If no API key is available.
        """
        self.api_key = api_key or os.getenv("GDOCZAI_API_KEY")
        if not self.api_key:
            raise GdoczValidationError("API key required")

        self.convert_base_url = convert_base_url or self._DEFAULT_BASE_URL
        self.extract_base_url = extract_base_url or self._DEFAULT_BASE_URL
        self.segment_base_url = segment_base_url or self._DEFAULT_BASE_URL
        self.timeout = timeout

    @staticmethod
    def _build_url(base_url, path):
        """Join ``base_url`` and ``path`` with exactly one slash between them."""
        return f"{base_url.rstrip('/')}/{path.lstrip('/')}"

    @staticmethod
    def _require_file(file_path):
        """Raise ``GdoczFileError`` unless ``file_path`` is an existing regular file."""
        # isfile (not exists) so a directory path fails here with the SDK's
        # own error instead of a raw OSError from open().
        if not os.path.isfile(file_path):
            raise GdoczFileError(f"File not found: {file_path}")

    def _post(self, url, timeout_message, **kwargs):
        """POST to ``url`` and return the decoded JSON body.

        Raises:
            GdoczTimeoutError: If the request times out.
            GdoczAPIError: If the status code is not 200 or the body is not
                valid JSON.
        """
        try:
            response = requests.post(url, timeout=self.timeout, **kwargs)
        except requests.exceptions.Timeout as err:
            raise GdoczTimeoutError(timeout_message) from err

        if response.status_code != 200:
            raise GdoczAPIError(response.status_code, response.text)

        try:
            return response.json()
        except ValueError as err:
            # A 200 with an undecodable body is still an API-level failure.
            raise GdoczAPIError(response.status_code, response.text) from err

    def convert(self, file_path, options=None):
        """Upload a file to the markdown-only endpoint.

        Args:
            file_path: Path of the file to upload.
            options: Optional object with ``mode`` and ``page_range``
                attributes. Without it, mode ``"chandra"`` is sent and no
                page range.

        Returns:
            ConvertResult built from the JSON response.

        Raises:
            GdoczFileError: If ``file_path`` is not an existing file.
            GdoczTimeoutError: If the request times out.
            GdoczAPIError: On a non-200 or non-JSON response.
        """
        self._require_file(file_path)

        mode = options.mode if options else "chandra"
        page_range = options.page_range if options else None

        # The selected mode is sent under the form field name "model".
        data = {"model": mode}
        if page_range:
            data["page_range"] = page_range

        # The file handle must stay open until the upload has finished.
        with open(file_path, "rb") as f:
            body = self._post(
                self._build_url(self.convert_base_url, "/ocr/markdown-only"),
                "Convert request timed out",
                files={"file": (os.path.basename(file_path), f)},
                data=data,
                headers={"X-API-Key": self.api_key},
            )

        return ConvertResult(body)

    def extract(self, file_path, options=None):
        """Extract fields from a document.

        When ``options.request_id`` is set, that id is sent together with the
        requested fields. Otherwise the file is first passed through
        :meth:`convert` and the resulting markdown is sent.

        Args:
            file_path: Path of the document; it must exist even when a
                ``request_id`` is supplied.
            options: Optional object with ``fields`` and ``request_id``
                attributes.

        Returns:
            ExtractResult built from the JSON response.

        Raises:
            GdoczFileError: If ``file_path`` is not an existing file.
            GdoczValidationError: If ``options.fields`` is neither a list nor
                a dict, or if convert returned no markdown.
            GdoczTimeoutError: If a request times out.
            GdoczAPIError: On a non-200 or non-JSON response.
        """
        self._require_file(file_path)

        if options and options.fields is not None and not isinstance(options.fields, (list, dict)):
            raise GdoczValidationError("fields must be a list or schema dict")

        if options and options.request_id:
            payload = {
                "request_id": options.request_id,
                "fields": options.fields,
            }
        else:
            markdown = self.convert(file_path).markdown
            if not markdown:
                raise GdoczValidationError("Markdown extraction failed — empty response from convert")

            payload = {
                "markdown_content": markdown,
                "fields": options.fields if options else [],
            }

        body = self._post(
            self._build_url(self.extract_base_url, "/ocr/extract/markdown"),
            "Extract request timed out",
            json=payload,
            headers={
                "X-API-Key": self.api_key,
                "Content-Type": "application/json",
            },
        )

        return ExtractResult(body)

    def segment(self, file_path, options=None):
        """Upload a file to the segment endpoint.

        Args:
            file_path: Path of the file to upload.
            options: Optional object with ``mode`` and ``segments``
                attributes. Without it, mode ``"auto"`` is sent.

        Returns:
            SegmentResult built from the JSON response.

        Raises:
            GdoczFileError: If ``file_path`` is not an existing file.
            GdoczValidationError: If mode is ``"guided"`` and no segments
                were given.
            GdoczTimeoutError: If the request times out.
            GdoczAPIError: On a non-200 or non-JSON response.
        """
        self._require_file(file_path)

        mode = options.mode if options else "auto"
        segments = options.segments if options else None

        if mode == "guided" and not segments:
            raise GdoczValidationError("Guided mode requires segments list")

        data = {"mode": mode}
        if mode == "guided":
            # Multipart form fields are flat, so the list is sent as JSON text.
            data["segments"] = json.dumps(segments)

        # The file handle must stay open until the upload has finished.
        with open(file_path, "rb") as f:
            body = self._post(
                self._build_url(self.segment_base_url, "/ocr/segment"),
                "Segment request timed out",
                files={"file": (os.path.basename(file_path), f)},
                data=data,
                headers={"X-API-Key": self.api_key},
            )

        return SegmentResult(body)
@@ -0,0 +1,5 @@
1
class ConvertOptions:
    """Settings for a convert request.

    Attributes are stored exactly as given: ``mode`` (default ``"chandra"``)
    and ``page_range`` (default ``None``).
    """

    def __init__(self, mode="chandra", page_range=None):
        self.mode, self.page_range = mode, page_range
@@ -0,0 +1,22 @@
1
+ import os
2
+
3
class ConvertResult:
    """Parsed response of a convert request.

    Top-level keys read from ``data``: ``success``, ``request_id``,
    ``markdown``, ``metadata``. From ``metadata``: ``page_count``,
    ``processing_time_seconds``, ``filename``, ``model``.
    """

    def __init__(self, data):
        """Copy the known response keys onto attributes.

        Missing keys become ``None`` (``False`` for ``success``).
        """
        self.success = data.get("success", False)
        self.request_id = data.get("request_id")
        self.markdown = data.get("markdown")

        # "or {}" also covers an explicit ``"metadata": null`` in the response,
        # which a plain .get default would let through as None.
        metadata = data.get("metadata") or {}
        self.metadata = metadata
        self.page_count = metadata.get("page_count")
        self.processing_time = metadata.get("processing_time_seconds")
        self.filename = metadata.get("filename")
        self.model = metadata.get("model")

    def save_output(self, folder_path):
        """Write the markdown to ``output.md`` inside ``folder_path``.

        The folder is created when missing; an empty file is written when
        there is no markdown.
        """
        os.makedirs(folder_path, exist_ok=True)

        with open(os.path.join(folder_path, "output.md"), "w", encoding="utf-8") as f:
            f.write(self.markdown or "")
@@ -0,0 +1,14 @@
1
class GdoczError(Exception):
    """Common base class for the SDK's exceptions."""


class GdoczAPIError(GdoczError):
    """Raised for an API response that signals failure.

    Attributes:
        status_code: HTTP status code given by the caller.
        response_data: Response payload given by the caller.
    """

    def __init__(self, status_code, response_data):
        self.status_code = status_code
        self.response_data = response_data
        super().__init__(f"API error {status_code}: {response_data}")


class GdoczTimeoutError(GdoczError):
    """Raised when a request times out."""


class GdoczFileError(GdoczError):
    """Raised for a problem with an input file path."""


class GdoczValidationError(GdoczError):
    """Raised when arguments or options fail validation."""
@@ -0,0 +1,9 @@
1
class ExtractOptions:
    """Settings for an extract request."""

    def __init__(self, fields=None, request_id=None):
        """
        fields     : field names to extract, e.g. ["total_amount", "tax"];
                     any falsy value is stored as an empty list
        request_id : id of an earlier convert request to reuse
        """
        self.request_id = request_id
        self.fields = fields if fields else []
@@ -0,0 +1,13 @@
1
class ExtractResult:
    """Parsed response of an extract request.

    Top-level keys read from ``data``: ``success``, ``extracted_data``,
    ``metadata``. From ``metadata``: ``fields_requested``,
    ``fields_extracted``, ``schema_type``, ``token_usage``.
    """

    def __init__(self, data):
        """Copy the known response keys onto attributes.

        Missing keys fall back to ``False`` for ``success``, ``{}`` for
        ``extracted_data`` and ``token_usage``, and ``None`` otherwise.
        """
        self.success = data.get("success", False)
        self.extracted_data = data.get("extracted_data", {})

        # "or {}" also covers an explicit ``"metadata": null`` in the response,
        # which a plain .get default would let through as None.
        metadata = data.get("metadata") or {}
        self.metadata = metadata
        self.fields_requested = metadata.get("fields_requested")
        self.fields_extracted = metadata.get("fields_extracted")
        self.schema_type = metadata.get("schema_type")
        self.token_usage = metadata.get("token_usage", {})
@@ -0,0 +1,9 @@
1
class SegmentOptions:
    """Settings for a segment request."""

    def __init__(self, mode="auto", segments=None):
        """
        mode:     "auto" or "guided"
        segments: list of {name, description}; any falsy value is stored
                  as an empty list
        """
        self.segments = segments if segments else []
        self.mode = mode
@@ -0,0 +1,17 @@
1
class SegmentResult:
    """Parsed response of a segment request.

    Top-level keys read from ``data``: ``success``, ``segments``,
    ``request_id``, ``processing_time``, ``metadata``. From ``metadata``:
    ``total_pages``, ``strategy``, ``mode``, ``filename``,
    ``ocr_time_seconds``, ``segmentation_time_seconds``.
    """

    def __init__(self, data):
        """Copy the known response keys onto attributes.

        Missing keys fall back to ``False`` for ``success``, ``[]`` for
        ``segments``, and ``None`` otherwise.
        """
        self.success = data.get("success", False)
        self.segments = data.get("segments", [])
        self.request_id = data.get("request_id")
        self.processing_time = data.get("processing_time")

        # "or {}" also covers an explicit ``"metadata": null`` in the response,
        # which a plain .get default would let through as None.
        metadata = data.get("metadata") or {}
        self.metadata = metadata
        self.total_pages = metadata.get("total_pages")
        self.strategy = metadata.get("strategy")
        self.mode = metadata.get("mode")
        self.filename = metadata.get("filename")
        self.ocr_time = metadata.get("ocr_time_seconds")
        self.segmentation_time = metadata.get("segmentation_time_seconds")
@@ -0,0 +1,26 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "GdoczAI"
7
+ version = "0.1.0"
8
+ description = "Python SDK for GdoczAI — OCR, extract, and segment documents"
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ authors = [
12
+ { name = "Ramkumar", email = "ramkumarlpm4@gmail.com" }
13
+ ]
14
+ keywords = ["ocr", "document", "pdf", "extract", "gdocz", "ai"]
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ ]
20
+ requires-python = ">=3.8"
21
+ dependencies = [
22
+ "requests>=2.28.0"
23
+ ]
24
+
25
+ [project.urls]
26
+ Homepage = "https://github.com/GramosoftAI/GdoczAI"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+