docextractbr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 DocExtract
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docextractbr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: SDK Python para a DocExtract API — extração de dados de documentos com IA
|
|
5
|
+
Project-URL: Homepage, https://docextract.com.br
|
|
6
|
+
Project-URL: Documentation, https://docextract.com.br/docs
|
|
7
|
+
Author: DocExtract
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: ai,api,document,extraction,llm,ocr,pdf,sdk
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
16
|
+
Classifier: Topic :: Text Processing
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Requires-Dist: httpx>=0.24.0
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# DocExtract
|
|
22
|
+
|
|
23
|
+
SDK Python para extração de dados de documentos com IA.
|
|
24
|
+
|
|
25
|
+
Converte PDFs, imagens e documentos brasileiros em JSON estruturado.
|
|
26
|
+
|
|
27
|
+
## Instalação
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install docextractbr
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Uso rápido
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from docextract import DocExtract
|
|
37
|
+
|
|
38
|
+
doc = DocExtract(api_key="dk_sua_chave")
|
|
39
|
+
|
|
40
|
+
# Extrair dados de um PDF
|
|
41
|
+
result = doc.extract("nota_fiscal.pdf")
|
|
42
|
+
print(result["fields"]["emitente_cnpj"])
|
|
43
|
+
print(result["confidence"]) # 0.98
|
|
44
|
+
|
|
45
|
+
# Especificar tipo de documento
|
|
46
|
+
result = doc.extract("cnh.jpg", document_type="cnh")
|
|
47
|
+
print(result["fields"]["nome"])
|
|
48
|
+
print(result["fields"]["cpf"])
|
|
49
|
+
|
|
50
|
+
# Auto-detect
|
|
51
|
+
result = doc.extract("documento.pdf", document_type="auto")
|
|
52
|
+
print(result["document_type"]) # "nfe", "cnh", etc.
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Tipos de documentos suportados
|
|
56
|
+
|
|
57
|
+
| Tipo | Código |
|
|
58
|
+
|------|--------|
|
|
59
|
+
| CNH | `cnh` |
|
|
60
|
+
| RG | `rg` |
|
|
61
|
+
| CPF | `cpf` |
|
|
62
|
+
| NF-e | `nfe` |
|
|
63
|
+
| NFS-e | `nfse` |
|
|
64
|
+
| Boleto | `boleto` |
|
|
65
|
+
| Contrato Social | `contrato_social` |
|
|
66
|
+
| Comprovante de Residência | `comprovante_residencia` |
|
|
67
|
+
| Holerite | `comprovante_renda` |
|
|
68
|
+
| Detecção automática | `auto` |
|
|
69
|
+
|
|
70
|
+
## BYOK (Bring Your Own Key)
|
|
71
|
+
|
|
72
|
+
Configure sua própria API key de IA no painel, ou passe inline:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
result = doc.extract(
|
|
76
|
+
"documento.pdf",
|
|
77
|
+
llm_provider="gemini",
|
|
78
|
+
llm_api_key="sua_gemini_key",
|
|
79
|
+
llm_model="gemini-2.5-flash",
|
|
80
|
+
)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Extrair de bytes em memória
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
with open("doc.pdf", "rb") as f:
|
|
87
|
+
result = doc.extract_bytes(f.read(), "doc.pdf", document_type="nfe")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Links
|
|
91
|
+
|
|
92
|
+
- Documentação: https://docextract.com.br/docs
|
|
93
|
+
- Painel: https://docextract.com.br
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# DocExtract
|
|
2
|
+
|
|
3
|
+
SDK Python para extração de dados de documentos com IA.
|
|
4
|
+
|
|
5
|
+
Converte PDFs, imagens e documentos brasileiros em JSON estruturado.
|
|
6
|
+
|
|
7
|
+
## Instalação
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install docextractbr
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Uso rápido
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from docextract import DocExtract
|
|
17
|
+
|
|
18
|
+
doc = DocExtract(api_key="dk_sua_chave")
|
|
19
|
+
|
|
20
|
+
# Extrair dados de um PDF
|
|
21
|
+
result = doc.extract("nota_fiscal.pdf")
|
|
22
|
+
print(result["fields"]["emitente_cnpj"])
|
|
23
|
+
print(result["confidence"]) # 0.98
|
|
24
|
+
|
|
25
|
+
# Especificar tipo de documento
|
|
26
|
+
result = doc.extract("cnh.jpg", document_type="cnh")
|
|
27
|
+
print(result["fields"]["nome"])
|
|
28
|
+
print(result["fields"]["cpf"])
|
|
29
|
+
|
|
30
|
+
# Auto-detect
|
|
31
|
+
result = doc.extract("documento.pdf", document_type="auto")
|
|
32
|
+
print(result["document_type"]) # "nfe", "cnh", etc.
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Tipos de documentos suportados
|
|
36
|
+
|
|
37
|
+
| Tipo | Código |
|
|
38
|
+
|------|--------|
|
|
39
|
+
| CNH | `cnh` |
|
|
40
|
+
| RG | `rg` |
|
|
41
|
+
| CPF | `cpf` |
|
|
42
|
+
| NF-e | `nfe` |
|
|
43
|
+
| NFS-e | `nfse` |
|
|
44
|
+
| Boleto | `boleto` |
|
|
45
|
+
| Contrato Social | `contrato_social` |
|
|
46
|
+
| Comprovante de Residência | `comprovante_residencia` |
|
|
47
|
+
| Holerite | `comprovante_renda` |
|
|
48
|
+
| Detecção automática | `auto` |
|
|
49
|
+
|
|
50
|
+
## BYOK (Bring Your Own Key)
|
|
51
|
+
|
|
52
|
+
Configure sua própria API key de IA no painel, ou passe inline:
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
result = doc.extract(
|
|
56
|
+
"documento.pdf",
|
|
57
|
+
llm_provider="gemini",
|
|
58
|
+
llm_api_key="sua_gemini_key",
|
|
59
|
+
llm_model="gemini-2.5-flash",
|
|
60
|
+
)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Extrair de bytes em memória
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
with open("doc.pdf", "rb") as f:
|
|
67
|
+
result = doc.extract_bytes(f.read(), "doc.pdf", document_type="nfe")
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Links
|
|
71
|
+
|
|
72
|
+
- Documentação: https://docextract.com.br/docs
|
|
73
|
+
- Painel: https://docextract.com.br
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Cliente Python para a DocExtract API."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
from typing import Optional

import httpx
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DocExtract:
    """Client for the DocExtract API, which extracts structured data from documents.

    Usage:
        doc = DocExtract(api_key="dk_sua_chave")
        result = doc.extract("nota_fiscal.pdf")
        print(result["fields"]["valor_total"])
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://api.docextract.com.br",
        timeout: float = 120.0,
    ):
        """Store the connection settings; no request is made here.

        Args:
            api_key: key sent in the ``x-api-key`` header of authenticated calls.
            base_url: root URL of the API; trailing slashes are removed.
            timeout: value passed as ``timeout`` to every httpx request.
        """
        self.api_key = api_key
        # Strip trailing slashes so endpoint paths can be joined with one "/".
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    @staticmethod
    def _form_fields(
        document_type: str,
        llm_provider: Optional[str],
        llm_api_key: Optional[str],
        llm_model: Optional[str],
    ) -> dict:
        """Build the multipart form fields, leaving out unset (falsy) overrides."""
        fields = {"document_type": document_type}
        overrides = {
            "llm_provider": llm_provider,
            "llm_api_key": llm_api_key,
            "llm_model": llm_model,
        }
        fields.update({key: value for key, value in overrides.items() if value})
        return fields

    @staticmethod
    def _quota_message(response) -> str:
        """Return the human-readable message of a 429 response, or "" if absent.

        Tolerates bodies that are not JSON, are not a JSON object, or whose
        ``detail`` is a plain string instead of ``{"message": ...}``, so the
        quota error is never masked by a parsing error.
        """
        try:
            payload = response.json()
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return ""
        detail = payload.get("detail", {}) if isinstance(payload, dict) else None
        if isinstance(detail, dict):
            return str(detail.get("message", ""))
        if isinstance(detail, str):
            return detail
        return ""

    def extract(
        self,
        file_path: str,
        document_type: str = "auto",
        llm_provider: Optional[str] = None,
        llm_api_key: Optional[str] = None,
        llm_model: Optional[str] = None,
    ) -> dict:
        """Upload a file from disk and return the extracted data.

        Args:
            file_path: path of the file (PDF, image, DOCX, etc.)
            document_type: "auto", "cnh", "rg", "nfe", "nfse", "boleto", etc.
            llm_provider: provider override (gemini/openai/anthropic)
            llm_api_key: LLM API key override (inline BYOK)
            llm_model: model override

        Returns:
            The decoded JSON body of the response; per the package README it
            carries success, document_type, confidence, fields and metadata.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
            Exception: if the API answers 429 (monthly limit reached).
            httpx.HTTPStatusError: for any other 4xx/5xx response.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")

        data = self._form_fields(document_type, llm_provider, llm_api_key, llm_model)

        with open(path, "rb") as f:
            response = httpx.post(
                f"{self.base_url}/v1/extract",
                headers={"x-api-key": self.api_key},
                files={"file": (path.name, f)},
                data=data,
                timeout=self.timeout,
            )

        if response.status_code == 429:
            # Plain Exception is kept on purpose: it is what callers of this
            # method already catch for the quota case.
            raise Exception(f"Limite mensal atingido: {self._quota_message(response)}")

        response.raise_for_status()
        return response.json()

    def extract_bytes(
        self,
        file_bytes: bytes,
        filename: str,
        document_type: str = "auto",
        llm_provider: Optional[str] = None,
        llm_api_key: Optional[str] = None,
        llm_model: Optional[str] = None,
    ) -> dict:
        """Upload in-memory bytes and return the extracted data.

        Args:
            file_bytes: content of the file as bytes
            filename: name of the file (used to detect its type)
            document_type: document type or "auto"
            llm_provider: provider override (gemini/openai/anthropic)
            llm_api_key: LLM API key override (inline BYOK)
            llm_model: model override

        Returns:
            The decoded JSON body of the response.

        Raises:
            httpx.HTTPStatusError: for any 4xx/5xx response, including 429.
        """
        response = httpx.post(
            f"{self.base_url}/v1/extract",
            headers={"x-api-key": self.api_key},
            files={"file": (filename, file_bytes)},
            data=self._form_fields(document_type, llm_provider, llm_api_key, llm_model),
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def types(self) -> dict:
        """List the supported document types.

        This call is sent without the ``x-api-key`` header.

        Raises:
            httpx.HTTPStatusError: for any 4xx/5xx response.
        """
        response = httpx.get(
            f"{self.base_url}/v1/types",
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def usage(self) -> dict:
        """Return the current month's usage for this API key.

        Raises:
            httpx.HTTPStatusError: for any 4xx/5xx response.
        """
        response = httpx.get(
            f"{self.base_url}/v1/usage",
            headers={"x-api-key": self.api_key},
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "docextractbr"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "SDK Python para a DocExtract API — extração de dados de documentos com IA"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.9"
|
|
7
|
+
license = {text = "MIT"}
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "DocExtract"},
|
|
10
|
+
]
|
|
11
|
+
keywords = ["document", "extraction", "ocr", "pdf", "ai", "llm", "api", "sdk"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Topic :: Software Development :: Libraries",
|
|
18
|
+
"Topic :: Text Processing",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"httpx>=0.24.0",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://docextract.com.br"
|
|
26
|
+
Documentation = "https://docextract.com.br/docs"
|
|
27
|
+
|
|
28
|
+
[build-system]
|
|
29
|
+
requires = ["hatchling"]
|
|
30
|
+
build-backend = "hatchling.build"
|
|
31
|
+
|
|
32
|
+
[tool.hatch.build.targets.wheel]
|
|
33
|
+
packages = ["docextract"]
|