docextractbr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ .env
6
+ *.db
7
+ .venv/
8
+ .ruff_cache/
9
+ .pytest_cache/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 DocExtract
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.4
2
+ Name: docextractbr
3
+ Version: 0.1.0
4
+ Summary: SDK Python para a DocExtract API — extração de dados de documentos com IA
5
+ Project-URL: Homepage, https://docextract.com.br
6
+ Project-URL: Documentation, https://docextract.com.br/docs
7
+ Author: DocExtract
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: ai,api,document,extraction,llm,ocr,pdf,sdk
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Topic :: Software Development :: Libraries
16
+ Classifier: Topic :: Text Processing
17
+ Requires-Python: >=3.9
18
+ Requires-Dist: httpx>=0.24.0
19
+ Description-Content-Type: text/markdown
20
+
21
+ # DocExtract
22
+
23
+ SDK Python para extração de dados de documentos com IA.
24
+
25
+ Converte PDFs, imagens e documentos brasileiros em JSON estruturado.
26
+
27
+ ## Instalação
28
+
29
+ ```bash
30
+ pip install docextractbr
31
+ ```
32
+
33
+ ## Uso rápido
34
+
35
+ ```python
36
+ from docextract import DocExtract
37
+
38
+ doc = DocExtract(api_key="dk_sua_chave")
39
+
40
+ # Extrair dados de um PDF
41
+ result = doc.extract("nota_fiscal.pdf")
42
+ print(result["fields"]["emitente_cnpj"])
43
+ print(result["confidence"]) # 0.98
44
+
45
+ # Especificar tipo de documento
46
+ result = doc.extract("cnh.jpg", document_type="cnh")
47
+ print(result["fields"]["nome"])
48
+ print(result["fields"]["cpf"])
49
+
50
+ # Auto-detect
51
+ result = doc.extract("documento.pdf", document_type="auto")
52
+ print(result["document_type"]) # "nfe", "cnh", etc.
53
+ ```
54
+
55
+ ## Tipos de documentos suportados
56
+
57
+ | Tipo | Código |
58
+ |------|--------|
59
+ | CNH | `cnh` |
60
+ | RG | `rg` |
61
+ | CPF | `cpf` |
62
+ | NF-e | `nfe` |
63
+ | NFS-e | `nfse` |
64
+ | Boleto | `boleto` |
65
+ | Contrato Social | `contrato_social` |
66
+ | Comprovante de Residência | `comprovante_residencia` |
67
+ | Holerite | `comprovante_renda` |
68
+ | Detecção automática | `auto` |
69
+
70
+ ## BYOK (Bring Your Own Key)
71
+
72
+ Configure sua própria API key de IA no painel, ou passe inline:
73
+
74
+ ```python
75
+ result = doc.extract(
76
+ "documento.pdf",
77
+ llm_provider="gemini",
78
+ llm_api_key="sua_gemini_key",
79
+ llm_model="gemini-2.5-flash",
80
+ )
81
+ ```
82
+
83
+ ## Extrair de bytes em memória
84
+
85
+ ```python
86
+ with open("doc.pdf", "rb") as f:
87
+ result = doc.extract_bytes(f.read(), "doc.pdf", document_type="nfe")
88
+ ```
89
+
90
+ ## Links
91
+
92
+ - Documentação: https://docextract.com.br/docs
93
+ - Painel: https://docextract.com.br
@@ -0,0 +1,73 @@
1
+ # DocExtract
2
+
3
+ SDK Python para extração de dados de documentos com IA.
4
+
5
+ Converte PDFs, imagens e documentos brasileiros em JSON estruturado.
6
+
7
+ ## Instalação
8
+
9
+ ```bash
10
+ pip install docextractbr
11
+ ```
12
+
13
+ ## Uso rápido
14
+
15
+ ```python
16
+ from docextract import DocExtract
17
+
18
+ doc = DocExtract(api_key="dk_sua_chave")
19
+
20
+ # Extrair dados de um PDF
21
+ result = doc.extract("nota_fiscal.pdf")
22
+ print(result["fields"]["emitente_cnpj"])
23
+ print(result["confidence"]) # 0.98
24
+
25
+ # Especificar tipo de documento
26
+ result = doc.extract("cnh.jpg", document_type="cnh")
27
+ print(result["fields"]["nome"])
28
+ print(result["fields"]["cpf"])
29
+
30
+ # Auto-detect
31
+ result = doc.extract("documento.pdf", document_type="auto")
32
+ print(result["document_type"]) # "nfe", "cnh", etc.
33
+ ```
34
+
35
+ ## Tipos de documentos suportados
36
+
37
+ | Tipo | Código |
38
+ |------|--------|
39
+ | CNH | `cnh` |
40
+ | RG | `rg` |
41
+ | CPF | `cpf` |
42
+ | NF-e | `nfe` |
43
+ | NFS-e | `nfse` |
44
+ | Boleto | `boleto` |
45
+ | Contrato Social | `contrato_social` |
46
+ | Comprovante de Residência | `comprovante_residencia` |
47
+ | Holerite | `comprovante_renda` |
48
+ | Detecção automática | `auto` |
49
+
50
+ ## BYOK (Bring Your Own Key)
51
+
52
+ Configure sua própria API key de IA no painel, ou passe inline:
53
+
54
+ ```python
55
+ result = doc.extract(
56
+ "documento.pdf",
57
+ llm_provider="gemini",
58
+ llm_api_key="sua_gemini_key",
59
+ llm_model="gemini-2.5-flash",
60
+ )
61
+ ```
62
+
63
+ ## Extrair de bytes em memória
64
+
65
+ ```python
66
+ with open("doc.pdf", "rb") as f:
67
+ result = doc.extract_bytes(f.read(), "doc.pdf", document_type="nfe")
68
+ ```
69
+
70
+ ## Links
71
+
72
+ - Documentação: https://docextract.com.br/docs
73
+ - Painel: https://docextract.com.br
@@ -0,0 +1,6 @@
1
+ """DocExtract SDK — extração de dados de documentos com IA."""
2
+
3
+ from docextract.client import DocExtract
4
+
5
+ __version__ = "0.1.0"
6
+ __all__ = ["DocExtract"]
@@ -0,0 +1,115 @@
1
+ """Cliente Python para a DocExtract API."""
2
+
3
from pathlib import Path
from typing import Optional

import httpx
6
+
7
+
8
class DocExtract:
    """Client for the DocExtract API: extracts structured data from documents.

    Each method issues a single request through the module-level
    ``httpx.post`` / ``httpx.get`` helpers against ``base_url``.

    Example:
        doc = DocExtract(api_key="dk_your_key")
        result = doc.extract("nota_fiscal.pdf")
        print(result["fields"]["valor_total"])
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://api.docextract.com.br",
        timeout: float = 120.0,
    ):
        """Store the credentials and connection settings.

        Args:
            api_key: key sent in the ``x-api-key`` header of authenticated calls.
            base_url: API root; trailing slashes are stripped so endpoint
                paths can be appended with a single ``/``.
            timeout: timeout, in seconds, passed to every HTTP request.
        """
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    @staticmethod
    def _form_data(
        document_type: str,
        llm_provider: Optional[str],
        llm_api_key: Optional[str],
        llm_model: Optional[str],
    ) -> dict:
        """Build the multipart form fields, omitting unset (falsy) overrides."""
        data = {"document_type": document_type}
        overrides = {
            "llm_provider": llm_provider,
            "llm_api_key": llm_api_key,
            "llm_model": llm_model,
        }
        data.update({key: value for key, value in overrides.items() if value})
        return data

    @staticmethod
    def _rate_limit_message(payload) -> str:
        """Pull a human-readable message out of a decoded 429 response body.

        Accepts ``{"detail": {"message": ...}}`` as well as a plain-string
        ``detail``; any other shape yields an empty string instead of raising.
        """
        if not isinstance(payload, dict):
            return ""
        detail = payload.get("detail")
        if isinstance(detail, dict):
            return str(detail.get("message", ""))
        return str(detail) if detail else ""

    def _post_file(self, filename: str, content, data: dict) -> dict:
        """POST one file to ``/v1/extract`` and return the decoded JSON body.

        Raises:
            Exception: on HTTP 429, carrying the server-provided message.
            httpx.HTTPStatusError: on any other 4xx/5xx status.
        """
        response = httpx.post(
            f"{self.base_url}/v1/extract",
            headers={"x-api-key": self.api_key},
            files={"file": (filename, content)},
            data=data,
            timeout=self.timeout,
        )

        if response.status_code == 429:
            try:
                payload = response.json()
            except ValueError:
                # A non-JSON 429 body must not mask the rate-limit error.
                payload = None
            raise Exception(
                f"Limite mensal atingido: {self._rate_limit_message(payload)}"
            )

        response.raise_for_status()
        return response.json()

    def extract(
        self,
        file_path: str,
        document_type: str = "auto",
        llm_provider: Optional[str] = None,
        llm_api_key: Optional[str] = None,
        llm_model: Optional[str] = None,
    ) -> dict:
        """Extract data from a document stored on disk.

        Args:
            file_path: path to the file (PDF, image, DOCX, etc.).
            document_type: "auto", "cnh", "rg", "nfe", "nfse", "boleto", etc.
            llm_provider: provider override (gemini/openai/anthropic).
            llm_api_key: LLM API key override (inline BYOK).
            llm_model: model override.

        Returns:
            The decoded JSON response; per the API docs it carries success,
            document_type, confidence, fields and metadata.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
            Exception: on HTTP 429 (monthly limit reached).
            httpx.HTTPStatusError: on any other 4xx/5xx status.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")

        data = self._form_data(document_type, llm_provider, llm_api_key, llm_model)
        with open(path, "rb") as f:
            return self._post_file(path.name, f, data)

    def extract_bytes(
        self,
        file_bytes: bytes,
        filename: str,
        document_type: str = "auto",
        llm_provider: Optional[str] = None,
        llm_api_key: Optional[str] = None,
        llm_model: Optional[str] = None,
    ) -> dict:
        """Extract data from a document held in memory.

        Args:
            file_bytes: raw file content.
            filename: file name sent with the upload (used to detect the type).
            document_type: document type, or "auto".
            llm_provider: provider override (gemini/openai/anthropic).
            llm_api_key: LLM API key override (inline BYOK).
            llm_model: model override.

        Returns:
            The decoded JSON response, same shape as ``extract``.

        Raises:
            Exception: on HTTP 429 (monthly limit reached).
            httpx.HTTPStatusError: on any other 4xx/5xx status.
        """
        data = self._form_data(document_type, llm_provider, llm_api_key, llm_model)
        return self._post_file(filename, file_bytes, data)

    def types(self) -> dict:
        """List the supported document types (sent without the API key header)."""
        response = httpx.get(
            f"{self.base_url}/v1/types",
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def usage(self) -> dict:
        """Return the current month's usage for this API key."""
        response = httpx.get(
            f"{self.base_url}/v1/usage",
            headers={"x-api-key": self.api_key},
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()
@@ -0,0 +1,33 @@
1
+ [project]
2
+ name = "docextractbr"
3
+ version = "0.1.0"
4
+ description = "SDK Python para a DocExtract API — extração de dados de documentos com IA"
5
+ readme = "README.md"
6
+ requires-python = ">=3.9"
7
+ license = {text = "MIT"}
8
+ authors = [
9
+ {name = "DocExtract"},
10
+ ]
11
+ keywords = ["document", "extraction", "ocr", "pdf", "ai", "llm", "api", "sdk"]
12
+ classifiers = [
13
+ "Development Status :: 3 - Alpha",
14
+ "Intended Audience :: Developers",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3",
17
+ "Topic :: Software Development :: Libraries",
18
+ "Topic :: Text Processing",
19
+ ]
20
+ dependencies = [
21
+ "httpx>=0.24.0",
22
+ ]
23
+
24
+ [project.urls]
25
+ Homepage = "https://docextract.com.br"
26
+ Documentation = "https://docextract.com.br/docs"
27
+
28
+ [build-system]
29
+ requires = ["hatchling"]
30
+ build-backend = "hatchling.build"
31
+
32
+ [tool.hatch.build.targets.wheel]
33
+ packages = ["docextract"]