chunkr-ai 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
chunkr_ai/api/auth.py CHANGED
@@ -1,5 +1,3 @@
1
- from typing import Optional
2
-
3
1
  class HeadersMixin:
4
2
  """Mixin class for handling authorization headers"""
5
3
 
chunkr_ai/api/base.py ADDED
@@ -0,0 +1,173 @@
1
+ from .config import Configuration
2
+ from .task import TaskResponse
3
+ from .auth import HeadersMixin
4
+ from abc import abstractmethod
5
+ from dotenv import load_dotenv
6
+ import io
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from PIL import Image
11
+ import requests
12
+ from typing import BinaryIO, Tuple, Union
13
+
14
class ChunkrBase(HeadersMixin):
    """Base class with shared functionality for Chunkr API clients.

    Resolves the API URL and key, converts the supported upload inputs into
    ``(filename, file-like object)`` pairs and builds the multipart payload.
    Concrete clients provide ``upload``, ``start_upload`` and ``get_task``.

    NOTE(review): the class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` markers below are not enforced at instantiation time.
    """

    # File extension used for each MIME type accepted in a base64 data URI.
    _MIME_TO_EXT = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'application/vnd.ms-powerpoint': 'ppt',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'application/vnd.ms-excel': 'xls',
        'image/jpeg': 'jpg',
        'image/png': 'png',
        'image/jpg': 'jpg',
    }

    def __init__(self, url: Union[str, None] = None, api_key: Union[str, None] = None):
        """Resolve the API endpoint and key.

        Args:
            url: Base URL of the API. Falls back to the ``CHUNKR_URL``
                environment variable, then to ``https://api.chunkr.ai``.
            api_key: API key. Falls back to the ``CHUNKR_API_KEY``
                environment variable.

        Raises:
            ValueError: If no API key can be resolved.
        """
        # Load a .env file (if any) before reading the environment.
        load_dotenv()
        self.url = (
            url or
            os.getenv('CHUNKR_URL') or
            'https://api.chunkr.ai'
        )
        self._api_key = (
            api_key or
            os.getenv('CHUNKR_API_KEY')
        )
        if not self._api_key:
            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")

        # Endpoints are built as f"{self.url}/api/...", so drop trailing slashes.
        self.url = self.url.rstrip("/")

    def _prepare_file(
        self,
        file: Union[str, Path, BinaryIO, Image.Image]
    ) -> Tuple[str, BinaryIO]:
        """Convert various file types into a tuple of (filename, file-like object).

        Args:
            file: Input file, can be:
                - String or Path to a file
                - URL string starting with http:// or https://
                - Base64 data URI string (``data:<mime>;base64,<payload>``)
                - Opened binary file (mode='rb')
                - PIL/Pillow Image object

        Returns:
            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload.
            For a file path the returned handle is opened here and is not
            closed by this method.

        Raises:
            FileNotFoundError: If the file path doesn't exist
            TypeError: If the file type is not supported
            ValueError: If the base64 payload is invalid or its MIME type is
                unsupported
            requests.exceptions.RequestException: If downloading a URL fails
        """
        # Handle URLs
        if isinstance(file, str) and file.startswith(('http://', 'https://')):
            from urllib.parse import urlparse

            # NOTE(review): no timeout is set, so a stalled server blocks here.
            response = requests.get(file)
            response.raise_for_status()
            file_obj = io.BytesIO(response.content)
            # Use only the URL path so a query string or fragment does not
            # leak into the upload filename.
            filename = Path(urlparse(file).path).name or 'downloaded_file'
            return filename, file_obj

        # Handle base64 data URIs
        if isinstance(file, str) and ';base64,' in file:
            import base64

            # Split header ("data:<mime>;base64") and payload
            header, base64_data = file.split(',', 1)
            mime_type = header.split(':')[-1].split(';')[0].lower()

            extension = self._MIME_TO_EXT.get(mime_type)
            if extension is None:
                raise ValueError(f"Unsupported MIME type: {mime_type}")

            # Only the decode step is guarded, so the MIME error above is not
            # re-labelled as an invalid base64 string.
            try:
                file_bytes = base64.b64decode(base64_data)
            except ValueError as e:  # binascii.Error is a ValueError subclass
                raise ValueError(f"Invalid base64 string: {str(e)}") from e

            return f"file.{extension}", io.BytesIO(file_bytes)

        # Handle file paths
        if isinstance(file, (str, Path)):
            path = Path(file).resolve()
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file}")
            return path.name, open(path, 'rb')

        # Handle PIL Images
        if isinstance(file, Image.Image):
            img_byte_arr = io.BytesIO()
            # Images created in memory have no format; default to PNG.
            image_format = file.format or 'PNG'
            file.save(img_byte_arr, format=image_format)
            img_byte_arr.seek(0)
            return f"image.{image_format.lower()}", img_byte_arr

        # Handle file-like objects
        if hasattr(file, 'read') and hasattr(file, 'seek'):
            # Use the object's name when it is path-like; streams without a
            # name, or with a non-path name such as an integer file
            # descriptor, fall back to a generic filename.
            name = getattr(file, 'name', None)
            if not isinstance(name, (str, os.PathLike)):
                name = 'document'
            return Path(name).name or 'document', file

        raise TypeError(f"Unsupported file type: {type(file)}")

    def _prepare_upload_data(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Union[Configuration, None] = None
    ) -> Tuple[dict, dict]:
        """Prepare files and data dictionaries for upload.

        Args:
            file: The file to upload
            config: Optional configuration settings

        Returns:
            Tuple[dict, dict]: (files dict, data dict) ready for upload.
            Nested (dict-valued) settings are added to the files dict as
            JSON-encoded parts; all other settings go into the data dict.
        """
        filename, file_obj = self._prepare_file(file)
        files = {"file": (filename, file_obj)}
        data = {}

        if config is not None:
            # Unset (None) options are omitted from the payload.
            config_dict = config.model_dump(mode="json", exclude_none=True)
            for key, value in config_dict.items():
                if isinstance(value, dict):
                    files[key] = (None, json.dumps(value), 'application/json')
                else:
                    data[key] = value

        return files, data

    @abstractmethod
    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Must be implemented by subclasses.
        """
        pass
chunkr_ai/api/chunkr.py CHANGED
@@ -1,125 +1,108 @@
1
- from .models import TaskResponse, Configuration
2
- from .auth import HeadersMixin
3
- from dotenv import load_dotenv
4
- import io
5
- import os
1
+ from .base import ChunkrBase
2
+ from .config import Configuration
3
+ from .task import TaskResponse
6
4
  from pathlib import Path
7
5
  from PIL import Image
8
6
  import requests
9
- from typing import Union, BinaryIO, Tuple
7
+ from typing import Union, BinaryIO
10
8
 
11
- class Chunkr(HeadersMixin):
12
- """Client for interacting with the Chunkr API."""
9
+ class Chunkr(ChunkrBase):
10
+ """Chunkr API client"""
13
11
 
14
12
  def __init__(self, url: str = None, api_key: str = None):
15
- load_dotenv()
16
- self.url = (
17
- url or
18
- os.getenv('CHUNKR_URL') or
19
- 'https://api.chunkr.ai'
20
- )
21
- self._api_key = (
22
- api_key or
23
- os.getenv('CHUNKR_API_KEY')
24
- )
25
- if not self._api_key:
26
- raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
27
-
28
- self.url = self.url.rstrip("/")
13
+ super().__init__(url, api_key)
14
+ self._session = requests.Session()
29
15
 
30
- def _prepare_file(
31
- self,
32
- file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO]
33
- ) -> Tuple[str, BinaryIO]:
34
- """Convert various file types into a tuple of (filename, file-like object).
16
+ def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
17
+ """Upload a file and wait for processing to complete.
35
18
 
36
19
  Args:
37
- file: Input file in various formats
20
+ file: The file to upload.
21
+ config: Configuration options for processing. Optional.
38
22
 
39
- Returns:
40
- Tuple[str, BinaryIO]: Filename and file-like object ready for upload
41
- """
42
- if isinstance(file, str):
43
- path = Path(file).resolve()
44
- if not path.exists():
45
- raise FileNotFoundError(f"File not found: {file}")
46
- return path.name, path.open("rb")
47
- elif isinstance(file, Image.Image):
48
- img_byte_arr = io.BytesIO()
49
- file.save(img_byte_arr, format=file.format or 'PNG')
50
- img_byte_arr.seek(0)
51
- return "image.png", img_byte_arr
52
- elif isinstance(file, bytes):
53
- return "document", io.BytesIO(file)
54
- elif isinstance(file, io.BytesIO):
55
- return "document", file
56
- else:
57
- return "document", file
58
-
59
- def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
60
- """Upload a file and wait for processing to complete.
23
+ Examples:
24
+ ```
25
+ # Upload from file path
26
+ chunkr.upload("document.pdf")
61
27
 
62
- The file can be one of:
63
- - str: Path to a file on disk
64
- - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
65
- - Image.Image: A PIL/Pillow Image object
66
- - bytes: Raw binary data
67
- - io.BytesIO: A binary stream in memory
28
+ # Upload from URL
29
+ chunkr.upload("https://example.com/document.pdf")
68
30
 
69
- Args:
70
- file: The file to upload.
71
- config:
72
- Configuration options for processing. Optional.
31
+ # Upload from base64 string (must include MIME type header)
32
+ chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
73
33
 
34
+ # Upload from opened file
35
+ with open("document.pdf", "rb") as f:
36
+ chunkr.upload(f)
37
+
38
+ # Upload an image
39
+ from PIL import Image
40
+ img = Image.open("photo.jpg")
41
+ chunkr.upload(img)
42
+ ```
74
43
  Returns:
75
44
  TaskResponse: The completed task response
76
45
  """
77
- return self.start_upload(file, config).poll()
46
+ task = self.start_upload(file, config)
47
+ return task.poll()
78
48
 
79
- def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
49
+ def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
80
50
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
81
51
 
82
- The file can be one of:
83
- - str: Path to a file on disk
84
- - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
85
- - Image.Image: A PIL/Pillow Image object
86
- - bytes: Raw binary data
87
- - io.BytesIO: A binary stream in memory
88
-
89
52
  Args:
90
53
  file: The file to upload.
91
- config (Configuration, optional): Configuration options for processing
54
+ config: Configuration options for processing. Optional.
55
+
56
+ Examples:
57
+ ```
58
+ # Upload from file path
59
+ task = chunkr.start_upload("document.pdf")
60
+
61
+ # Upload from opened file
62
+ with open("document.pdf", "rb") as f:
63
+ task = chunkr.start_upload(f)
64
+
65
+ # Upload from URL
66
+ task = chunkr.start_upload("https://example.com/document.pdf")
67
+
68
+ # Upload from base64 string (must include MIME type header)
69
+ task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
70
+
71
+ # Upload an image
72
+ from PIL import Image
73
+ img = Image.open("photo.jpg")
74
+ task = chunkr.start_upload(img)
75
+
76
+ # Wait for the task to complete - this can be done when needed
77
+ task.poll()
78
+ ```
92
79
 
93
80
  Returns:
94
81
  TaskResponse: The initial task response
95
-
96
- Raises:
97
- requests.exceptions.HTTPError: If the API request fails
98
82
  """
99
- url = f"{self.url}/api/v1/task"
100
- filename, file_obj = self._prepare_file(file)
101
-
102
- files = {"file": (filename, file_obj)}
103
- r = requests.post(
104
- url,
105
- files=files,
106
- json=config.dict() if config else {},
83
+ files, data = self._prepare_upload_data(file, config)
84
+ r = self._session.post(
85
+ f"{self.url}/api/v1/task",
86
+ files=files,
87
+ data=data,
107
88
  headers=self._headers()
108
89
  )
109
90
  r.raise_for_status()
110
- return TaskResponse(**r.json()).with_api_key(self._api_key)
91
+ return TaskResponse(**r.json()).with_client(self)
111
92
 
112
93
  def get_task(self, task_id: str) -> TaskResponse:
113
94
  """Get a task response by its ID.
114
95
 
115
96
  Args:
116
- task_id (str): The ID of the task to get
97
+ task_id: The ID of the task to get
117
98
 
118
99
  Returns:
119
100
  TaskResponse: The task response
120
101
  """
121
- url = f"{self.url}/api/v1/task/{task_id}"
122
- r = requests.get(url, headers=self._headers())
102
+ r = self._session.get(
103
+ f"{self.url}/api/v1/task/{task_id}",
104
+ headers=self._headers()
105
+ )
123
106
  r.raise_for_status()
124
- return TaskResponse(**r.json()).with_api_key(self._api_key)
107
+ return TaskResponse(**r.json()).with_client(self)
125
108
 
@@ -1,39 +1,105 @@
1
- from .chunkr import Chunkr
2
- from .models import TaskResponse, Configuration
1
+ from .base import ChunkrBase
2
+ from .task import TaskResponse
3
+ from .config import Configuration
3
4
  import httpx
4
- import io
5
+ from pathlib import Path
5
6
  from PIL import Image
6
7
  from typing import Union, BinaryIO
7
8
 
8
- class ChunkrAsync(Chunkr):
9
- """Async client for interacting with the Chunkr API.
9
+ class ChunkrAsync(ChunkrBase):
10
+ """Asynchronous Chunkr API client"""
10
11
 
11
- This class inherits from the Chunkr class but works with async HTTP requests.
12
- """
12
+ def __init__(self, url: str = None, api_key: str = None):
13
+ super().__init__(url, api_key)
14
+ self._client = httpx.AsyncClient()
13
15
 
14
- async def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
16
+ async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
17
+ """Upload a file and wait for processing to complete.
18
+
19
+ Args:
20
+ file: The file to upload.
21
+ config: Configuration options for processing. Optional.
22
+
23
+ Examples:
24
+ ```python
25
+ # Upload from file path
26
+ await chunkr.upload("document.pdf")
27
+
28
+ # Upload from opened file
29
+ with open("document.pdf", "rb") as f:
30
+ await chunkr.upload(f)
31
+
32
+ # Upload from URL
33
+ await chunkr.upload("https://example.com/document.pdf")
34
+
35
+ # Upload from base64 string (must include MIME type header)
36
+ await chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
37
+
38
+ # Upload an image
39
+ from PIL import Image
40
+ img = Image.open("photo.jpg")
41
+ await chunkr.upload(img)
42
+ ```
43
+ Returns:
44
+ TaskResponse: The completed task response
45
+ """
15
46
  task = await self.start_upload(file, config)
16
47
  return await task.poll_async()
17
48
 
18
- async def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
19
- url = f"{self.url}/api/v1/task"
20
- filename, file_obj = self._prepare_file(file)
21
- async with httpx.AsyncClient() as client:
22
- files = {"file": (filename, file_obj)}
23
- r = await client.post(
24
- url,
25
- files=files,
26
- json=config.dict() if config else {},
27
- headers=self._headers()
28
- )
29
- r.raise_for_status()
30
- return TaskResponse(**r.json()).with_api_key(self._api_key)
49
async def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
    """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.

    Args:
        file: The file to upload.
        config: Configuration options for processing. Optional.

    Examples:
    ```
    # Upload from file path
    task = await chunkr.start_upload("document.pdf")

    # Upload from opened file
    with open("document.pdf", "rb") as f:
        task = await chunkr.start_upload(f)

    # Upload from URL
    task = await chunkr.start_upload("https://example.com/document.pdf")

    # Upload from base64 string (must include MIME type header)
    task = await chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

    # Upload an image
    from PIL import Image
    img = Image.open("photo.jpg")
    task = await chunkr.start_upload(img)

    # Wait for the task to complete - this can be done when needed
    await task.poll_async()
    ```

    Returns:
        TaskResponse: The initial task response

    Raises:
        httpx.HTTPStatusError: If the API responds with an error status
    """
    files, data = self._prepare_upload_data(file, config)
    r = await self._client.post(
        f"{self.url}/api/v1/task",
        files=files,
        # Send the scalar settings as multipart form fields. httpx ignores
        # `json=` whenever `files=` is given, so passing the configuration
        # as JSON here would silently drop it.
        data=data,
        headers=self._headers()
    )
    r.raise_for_status()
    return TaskResponse(**r.json()).with_client(self)
31
92
 
32
93
  async def get_task(self, task_id: str) -> TaskResponse:
33
- url = f"{self.url}/api/v1/task/{task_id}"
34
- async with httpx.AsyncClient() as client:
35
- r = await client.get(url, headers=self._headers())
36
- r.raise_for_status()
37
- return TaskResponse(**r.json()).with_api_key(self._api_key)
94
+ r = await self._client.get(
95
+ f"{self.url}/api/v1/task/{task_id}",
96
+ headers=self._headers()
97
+ )
98
+ r.raise_for_status()
99
+ return TaskResponse(**r.json()).with_client(self)
100
+
101
+ async def __aenter__(self):
102
+ return self
38
103
 
39
-
104
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
105
+ await self._client.aclose()
@@ -0,0 +1,131 @@
1
+ from pydantic import BaseModel, Field, model_validator
2
+ from enum import Enum
3
+ from typing import Optional, List, Dict
4
+
5
+ class GenerationStrategy(str, Enum):
6
+ LLM = "LLM"
7
+ AUTO = "Auto"
8
+
9
+ class CroppingStrategy(str, Enum):
10
+ ALL = "All"
11
+ AUTO = "Auto"
12
+
13
+ class LlmConfig(BaseModel):
14
+ model: str
15
+ prompt: str
16
+ temperature: float = 0.0
17
+
18
+ class GenerationConfig(BaseModel):
19
+ html: Optional[GenerationStrategy] = None
20
+ llm: Optional[LlmConfig] = None
21
+ markdown: Optional[GenerationStrategy] = None
22
+ crop_image: Optional[CroppingStrategy] = None
23
+
24
+ class SegmentProcessing(BaseModel):
25
+ title: Optional[GenerationConfig] = None
26
+ section_header: Optional[GenerationConfig] = None
27
+ text: Optional[GenerationConfig] = None
28
+ list_item: Optional[GenerationConfig] = None
29
+ table: Optional[GenerationConfig] = None
30
+ picture: Optional[GenerationConfig] = None
31
+ caption: Optional[GenerationConfig] = None
32
+ formula: Optional[GenerationConfig] = None
33
+ footnote: Optional[GenerationConfig] = None
34
+ page_header: Optional[GenerationConfig] = None
35
+ page_footer: Optional[GenerationConfig] = None
36
+ page: Optional[GenerationConfig] = None
37
+
38
+ class ChunkProcessing(BaseModel):
39
+ target_length: Optional[int] = None
40
+
41
+ class Property(BaseModel):
42
+ name: str
43
+ title: Optional[str] = None
44
+ prop_type: str
45
+ description: Optional[str] = None
46
+ default: Optional[str] = None
47
+
48
+ class JsonSchema(BaseModel):
49
+ title: str
50
+ properties: List[Property]
51
+
52
+ class OcrStrategy(str, Enum):
53
+ ALL = "All"
54
+ AUTO = "Auto"
55
+
56
+ class SegmentationStrategy(str, Enum):
57
+ LAYOUT_ANALYSIS = "LayoutAnalysis"
58
+ PAGE = "Page"
59
+
60
+ class BoundingBox(BaseModel):
61
+ left: float
62
+ top: float
63
+ width: float
64
+ height: float
65
+
66
+ class OCRResult(BaseModel):
67
+ bbox: BoundingBox
68
+ text: str
69
+ confidence: Optional[float]
70
+
71
+ class SegmentType(str, Enum):
72
+ CAPTION = "Caption"
73
+ FOOTNOTE = "Footnote"
74
+ FORMULA = "Formula"
75
+ LIST_ITEM = "ListItem"
76
+ PAGE = "Page"
77
+ PAGE_FOOTER = "PageFooter"
78
+ PAGE_HEADER = "PageHeader"
79
+ PICTURE = "Picture"
80
+ SECTION_HEADER = "SectionHeader"
81
+ TABLE = "Table"
82
+ TEXT = "Text"
83
+ TITLE = "Title"
84
+
85
+ class Segment(BaseModel):
86
+ bbox: BoundingBox
87
+ content: str
88
+ page_height: float
89
+ html: Optional[str]
90
+ image: Optional[str]
91
+ markdown: Optional[str]
92
+ ocr: List[OCRResult]
93
+ page_number: int
94
+ page_width: float
95
+ segment_id: str
96
+ segment_type: SegmentType
97
+
98
+ class Chunk(BaseModel):
99
+ chunk_id: str
100
+ chunk_length: int
101
+ segments: List[Segment]
102
+
103
+ class ExtractedJson(BaseModel):
104
+ data: Dict
105
+
106
+ class OutputResponse(BaseModel):
107
+ chunks: List[Chunk] = []
108
+ extracted_json: Optional[ExtractedJson]
109
+
110
+ class Model(str, Enum):
111
+ FAST = "Fast"
112
+ HIGH_QUALITY = "HighQuality"
113
+
114
class Configuration(BaseModel):
    """Processing options sent with an upload.

    Every option is optional and defaults to ``None``. The deprecated
    ``target_chunk_length`` input key is still accepted and is mapped to
    ``chunk_processing.target_length``.
    """

    chunk_processing: Optional[ChunkProcessing] = Field(default=None)
    expires_in: Optional[int] = Field(default=None)
    high_resolution: Optional[bool] = Field(default=None)
    json_schema: Optional[JsonSchema] = Field(default=None)
    model: Optional[Model] = Field(default=None)
    ocr_strategy: Optional[OcrStrategy] = Field(default=None)
    segment_processing: Optional[SegmentProcessing] = Field(default=None)
    segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)

    @model_validator(mode='before')
    @classmethod
    def map_deprecated_fields(cls, values: Dict) -> Dict:
        """Translate the deprecated ``target_chunk_length`` key.

        Args:
            values: Raw input passed to the model. Non-dict input and input
                without the deprecated key are returned unchanged.

        Returns:
            The input with ``target_chunk_length`` removed and, when it was
            not ``None``, stored as ``chunk_processing["target_length"]``.
        """
        if not isinstance(values, dict) or "target_chunk_length" not in values:
            return values

        # Work on copies so the caller's dict (for example a parsed API
        # response handed to model_validate) is not modified.
        values = dict(values)
        target_length = values.pop("target_chunk_length")
        if target_length is not None:
            chunk_processing = values.get("chunk_processing") or {}
            if isinstance(chunk_processing, BaseModel):
                # An already-built ChunkProcessing does not support item
                # assignment; convert it to a plain dict first.
                chunk_processing = chunk_processing.model_dump()
            else:
                chunk_processing = dict(chunk_processing)
            chunk_processing["target_length"] = target_length
            values["chunk_processing"] = chunk_processing
        return values
@@ -0,0 +1,19 @@
1
+ from typing import runtime_checkable, Protocol
2
+ from requests import Session
3
+ from httpx import AsyncClient
4
+
5
+ @runtime_checkable
6
+ class ChunkrClientProtocol(Protocol):
7
+ """Protocol defining the interface for Chunkr clients"""
8
+ url: str
9
+ _api_key: str
10
+ _session: Session
11
+ _client: AsyncClient
12
+
13
+ def get_api_key(self) -> str:
14
+ """Get the API key"""
15
+ ...
16
+
17
+ def _headers(self) -> dict:
18
+ """Return headers required for API requests"""
19
+ ...
chunkr_ai/api/task.py ADDED
@@ -0,0 +1,131 @@
1
+ from .protocol import ChunkrClientProtocol
2
+ from .config import Configuration, OutputResponse
3
+ import asyncio
4
+ from datetime import datetime
5
+ from enum import Enum
6
+ from pydantic import BaseModel, PrivateAttr
7
+ import time
8
+ from typing import Optional, Union
9
+
10
+ class Status(str, Enum):
11
+ STARTING = "Starting"
12
+ PROCESSING = "Processing"
13
+ SUCCEEDED = "Succeeded"
14
+ FAILED = "Failed"
15
+
16
class TaskResponse(BaseModel):
    """A task returned by the API, with helpers to poll it and read its output.

    A client must be attached with ``with_client`` before polling; the poll
    helpers use that client's HTTP session and headers.
    """

    # NOTE(review): the Optional fields below have no default, so pydantic v2
    # still requires the key to be present in the input - confirm the API
    # always sends them.
    configuration: Configuration
    created_at: datetime
    expires_at: Optional[datetime]
    file_name: Optional[str]
    finished_at: Optional[datetime]
    input_file_url: Optional[str]
    message: str
    output: Optional[OutputResponse]
    page_count: Optional[int]
    pdf_url: Optional[str]
    status: Status
    task_id: str
    task_url: Optional[str]
    _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)

    def with_client(self, client: ChunkrClientProtocol) -> 'TaskResponse':
        """Attach the client used for polling and return this task."""
        self._client = client
        return self

    def _require_client(self) -> ChunkrClientProtocol:
        """Return the attached client or fail with a clear error."""
        if self._client is None:
            raise ValueError("No client attached to this task; call with_client() before polling")
        return self._client

    def _apply_update(self, response: dict) -> None:
        """Merge a poll response into this task, keeping fields validated.

        The response keys override the current values, and the merged data is
        validated again so ``output``, ``status``, the timestamps and
        ``configuration`` keep their model types instead of raw JSON values.
        """
        refreshed = type(self).model_validate({**self.model_dump(), **response})
        for field_name in type(self).model_fields:
            setattr(self, field_name, getattr(refreshed, field_name))

    def _poll_request_sync(self) -> dict:
        """Helper method to make polling request with retry logic (synchronous)

        Connection and timeout errors are retried every 0.5 seconds without
        limit; any other error (including HTTP error statuses) propagates.

        Raises:
            ValueError: If the task has no task URL or no client is attached.
        """
        if not self.task_url:
            raise ValueError("Task URL not found in response")
        client = self._require_client()

        import requests

        # requests' transport errors do not derive from the builtin
        # ConnectionError/TimeoutError, so they are listed explicitly.
        retryable = (
            ConnectionError,
            TimeoutError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        )
        while True:
            try:
                r = client._session.get(self.task_url, headers=client._headers())
                r.raise_for_status()
                return r.json()
            except retryable:
                print("Connection error while polling the task, retrying...")
                time.sleep(0.5)

    async def _poll_request_async(self) -> dict:
        """Helper method to make polling request with retry logic (asynchronous)

        Connection and timeout errors are retried every 0.5 seconds without
        limit; any other error (including HTTP error statuses) propagates.

        Raises:
            ValueError: If the task has no task URL or no client is attached.
        """
        if not self.task_url:
            raise ValueError("Task URL not found in response")
        client = self._require_client()

        import httpx

        # httpx's transport errors do not derive from the builtin
        # ConnectionError/TimeoutError, so they are listed explicitly.
        retryable = (
            ConnectionError,
            TimeoutError,
            httpx.ConnectError,
            httpx.TimeoutException,
        )
        while True:
            try:
                r = await client._client.get(self.task_url, headers=client._headers())
                # Only the request itself is awaitable; raise_for_status()
                # and json() are plain synchronous methods on the response.
                r.raise_for_status()
                return r.json()
            except retryable:
                print("Connection error while polling the task, retrying...")
                await asyncio.sleep(0.5)

    def _check_status(self) -> Optional['TaskResponse']:
        """Helper method to check task status and handle completion/failure

        Returns:
            This task once it is no longer starting or processing, otherwise
            ``None``.

        Raises:
            ValueError: If the task failed; the message is the task message.
        """
        if self.status == Status.FAILED:
            raise ValueError(self.message)
        if self.status not in (Status.STARTING, Status.PROCESSING):
            return self
        return None

    def poll(self) -> 'TaskResponse':
        """Poll the task for completion.

        Blocks, polling every 0.5 seconds, until the task leaves the
        starting/processing states.

        Raises:
            ValueError: If the task failed.
        """
        while True:
            self._apply_update(self._poll_request_sync())

            result = self._check_status()
            if result is not None:
                return result

            time.sleep(0.5)

    async def poll_async(self) -> 'TaskResponse':
        """Poll the task for completion asynchronously.

        Polls every 0.5 seconds until the task leaves the starting/processing
        states.

        Raises:
            ValueError: If the task failed.
        """
        while True:
            self._apply_update(await self._poll_request_async())

            result = self._check_status()
            if result is not None:
                return result

            await asyncio.sleep(0.5)

    def _get_content(self, content_type: str) -> str:
        """Helper method to get either HTML, Markdown, or raw content.

        Args:
            content_type: Segment attribute to read: "html", "markdown" or
                "content".

        Returns:
            The non-empty values of that attribute across all segments of all
            chunks, joined with newlines; "" when there is no output.
        """
        if not self.output:
            return ""
        parts = []
        for chunk in self.output.chunks:
            for segment in chunk.segments:
                text = getattr(segment, content_type)
                if text:
                    parts.append(text)
        return "\n".join(parts)

    def html(self) -> str:
        """Get full HTML for the task"""
        return self._get_content("html")

    def markdown(self) -> str:
        """Get full markdown for the task"""
        return self._get_content("markdown")

    def content(self) -> str:
        """Get full text for the task"""
        return self._get_content("content")
121
+
122
+ class TaskPayload(BaseModel):
123
+ current_configuration: Configuration
124
+ file_name: str
125
+ image_folder_location: str
126
+ input_location: str
127
+ output_location: str
128
+ pdf_location: str
129
+ previous_configuration: Optional[Configuration]
130
+ task_id: str
131
+ user_id: str
chunkr_ai/models.py ADDED
@@ -0,0 +1,48 @@
1
+ from .api.config import (
2
+ BoundingBox,
3
+ Chunk,
4
+ ChunkProcessing,
5
+ Configuration,
6
+ CroppingStrategy,
7
+ ExtractedJson,
8
+ GenerationStrategy,
9
+ GenerationConfig,
10
+ JsonSchema,
11
+ LlmConfig,
12
+ Model,
13
+ OCRResult,
14
+ OcrStrategy,
15
+ OutputResponse,
16
+ Property,
17
+ Segment,
18
+ SegmentProcessing,
19
+ SegmentType,
20
+ SegmentationStrategy,
21
+ )
22
+
23
+ from .api.task import TaskResponse, TaskPayload, Status
24
+
25
+ __all__ = [
26
+ 'BoundingBox',
27
+ 'Chunk',
28
+ 'ChunkProcessing',
29
+ 'Configuration',
30
+ 'CroppingStrategy',
31
+ 'ExtractedJson',
32
+ 'GenerationConfig',
33
+ 'GenerationStrategy',
34
+ 'JsonSchema',
35
+ 'LlmConfig',
36
+ 'Model',
37
+ 'OCRResult',
38
+ 'OcrStrategy',
39
+ 'OutputResponse',
40
+ 'Property',
41
+ 'Segment',
42
+ 'SegmentProcessing',
43
+ 'SegmentType',
44
+ 'SegmentationStrategy',
45
+ 'Status',
46
+ 'TaskPayload',
47
+ 'TaskResponse'
48
+ ]
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.2
2
+ Name: chunkr-ai
3
+ Version: 0.0.4
4
+ Summary: Python client for Chunkr: open source document intelligence
5
+ Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
+ Project-URL: Homepage, https://chunkr.ai
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: httpx>=0.28.1
10
+ Requires-Dist: pillow>=11.1.0
11
+ Requires-Dist: pydantic>=2.10.4
12
+ Requires-Dist: python-dotenv>=1.0.1
13
+ Requires-Dist: requests>=2.32.3
14
+ Provides-Extra: test
15
+ Requires-Dist: pytest>=8.3.4; extra == "test"
16
+ Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
17
+
18
+ # Chunkr Python Client
19
+
20
+ This provides a simple interface to interact with the Chunkr API.
21
+
22
+ ## Getting Started
23
+
24
+ You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
25
+
26
+ For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install chunkr-ai
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
37
+
38
+ ### Synchronous Usage
39
+
40
+ ```python
41
+ from chunkr_ai import Chunkr
42
+
43
+ # Initialize client
44
+ chunkr = Chunkr()
45
+
46
+ # Upload a file and wait for processing
47
+ task = chunkr.upload("document.pdf")
48
+
49
+ # Print the response
50
+ print(task)
51
+
52
+ # Get output from task
53
+ output = task.output
54
+
55
+ # If you want to upload without waiting for processing
56
+ task = chunkr.start_upload("document.pdf")
57
+ # ... do other things ...
58
+ task.poll() # Block until processing completes, when needed
59
+ ```
60
+
61
+ ### Asynchronous Usage
62
+
63
+ ```python
64
+ from chunkr_ai import ChunkrAsync
65
+
66
+ async def process_document():
67
+ # Initialize client
68
+ chunkr = ChunkrAsync()
69
+
70
+ # Upload a file and wait for processing
71
+ task = await chunkr.upload("document.pdf")
72
+
73
+ # Print the response
74
+ print(task)
75
+
76
+ # Get output from task
77
+ output = task.output
78
+
79
+ # If you want to upload without waiting for processing
80
+ task = await chunkr.start_upload("document.pdf")
81
+ # ... do other things ...
82
+ await task.poll_async() # Wait until processing completes, when needed
83
+ ```
84
+
85
+ ### Additional Features
86
+
87
+ Both clients support various input types:
88
+
89
+ ```python
90
+ # Upload from file path
91
+ chunkr.upload("document.pdf")
92
+
93
+ # Upload from opened file
94
+ with open("document.pdf", "rb") as f:
95
+ chunkr.upload(f)
96
+
97
+ # Upload from URL
98
+ chunkr.upload("https://example.com/document.pdf")
99
+
100
+ # Upload from base64 string
101
+ chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
102
+
103
+ # Upload an image
104
+ from PIL import Image
105
+ img = Image.open("photo.jpg")
106
+ chunkr.upload(img)
107
+ ```
108
+
109
+ ### Configuration
110
+
111
+ You can customize the processing behavior by passing a `Configuration` object:
112
+
113
+ ```python
114
+ from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
115
+
116
+ # Basic configuration
117
+ config = Configuration(
118
+ ocr_strategy=OcrStrategy.AUTO,
119
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
120
+ high_resolution=True,
121
+ expires_in=3600, # seconds
122
+ )
123
+
124
+ # Upload with configuration
125
+ task = chunkr.upload("document.pdf", config)
126
+ ```
127
+
128
+ #### Available Configuration Examples
129
+
130
+ - **Chunk Processing**
131
+ ```python
132
+ from chunkr_ai.models import ChunkProcessing
133
+ config = Configuration(
134
+ chunk_processing=ChunkProcessing(target_length=1024)
135
+ )
136
+ ```
137
+ - **Expires In**
138
+ ```python
139
+ config = Configuration(expires_in=3600)
140
+ ```
141
+
142
+ - **High Resolution**
143
+ ```python
144
+ config = Configuration(high_resolution=True)
145
+ ```
146
+
147
+ - **JSON Schema**
148
+ ```python
149
+ config = Configuration(json_schema=JsonSchema(
150
+ title="Sales Data",
151
+ properties=[
152
+ Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
153
+ Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
154
+ ]
155
+ ))
156
+ ```
157
+
158
+ - **OCR Strategy**
159
+ ```python
160
+ config = Configuration(ocr_strategy=OcrStrategy.AUTO)
161
+ ```
162
+
163
+ - **Segment Processing**
164
+ ```python
165
+ from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
166
+ config = Configuration(
167
+ segment_processing=SegmentProcessing(
168
+ page=GenerationConfig(
169
+ html=GenerationStrategy.LLM,
170
+ markdown=GenerationStrategy.LLM
171
+ )
172
+ )
173
+ )
174
+ ```
175
+
176
+ - **Segmentation Strategy**
177
+ ```python
178
+ config = Configuration(
179
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS # or SegmentationStrategy.PAGE
180
+ )
181
+ ```
182
+
183
+ ## Environment setup
184
+
185
+ You can provide your API key and URL in several ways:
186
+ 1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
187
+ 2. `.env` file
188
+ 3. Direct initialization:
189
+ ```python
190
+ chunkr = Chunkr(
191
+ api_key="your-api-key",
192
+ url="https://api.chunkr.ai"
193
+ )
194
+ ```
195
+
196
+ ## Run tests
197
+
198
+ ```bash
199
+ # Install dependencies
200
+ uv pip install -e ".[test]"
201
+
202
+ # Run tests
203
+ uv run pytest
204
+ ```
@@ -0,0 +1,17 @@
1
+ chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
+ chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ chunkr_ai/models.py,sha256=d-B4vfgZClJOoHdPaH3vagwUc4qxeQSmUxab77DKYtQ,874
4
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
7
+ chunkr_ai/api/base.py,sha256=WDHx8tU0fl9_-yvYTKL-U0uaxHv-8_bRfiw9Xkl-mWM,6499
8
+ chunkr_ai/api/chunkr.py,sha256=LkBFzGB_T0y3fnBeIn_nwQW6Mb7eZO-iTlzWrmWBoko,3450
9
+ chunkr_ai/api/chunkr_async.py,sha256=B9deRVoe4h3Csh_jEuQxuxQ-DKSuZPdwkanFTyfHmeM,3603
10
+ chunkr_ai/api/config.py,sha256=K0s1giImciPksu-bO9gzRwUaK2Vo1nxNKQkXlRQ2cb8,3785
11
+ chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
12
+ chunkr_ai/api/task.py,sha256=ALU-rYlObbitlM1MKEFeSz_IBUpzb9736Iqu9huWg7c,4392
13
+ chunkr_ai-0.0.4.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ chunkr_ai-0.0.4.dist-info/METADATA,sha256=7k2zij-F7_Kcs6nFCJMKQW382gFpOOLAnZoOOXFrKFs,4913
15
+ chunkr_ai-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
16
+ chunkr_ai-0.0.4.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
17
+ chunkr_ai-0.0.4.dist-info/RECORD,,
chunkr_ai/api/models.py DELETED
@@ -1,231 +0,0 @@
1
- from .auth import HeadersMixin
2
- import asyncio
3
- from datetime import datetime
4
- from enum import Enum
5
- import httpx
6
- from pydantic import BaseModel, Field, PrivateAttr
7
- import requests
8
- import time
9
- from typing import Optional, List, Dict, Union
10
-
11
- class GenerationStrategy(str, Enum):
12
- LLM = "LLM"
13
- AUTO = "Auto"
14
-
15
- class CroppingStrategy(str, Enum):
16
- ALL = "All"
17
- AUTO = "Auto"
18
-
19
- class LlmConfig(BaseModel):
20
- model: str
21
- prompt: str
22
- temperature: float = 0.0
23
-
24
- class AutoGenerationConfig(BaseModel):
25
- html: GenerationStrategy = GenerationStrategy.AUTO
26
- llm: Optional[LlmConfig] = None
27
- markdown: GenerationStrategy = GenerationStrategy.AUTO
28
- crop_image: CroppingStrategy = CroppingStrategy.ALL
29
-
30
- class LlmGenerationConfig(BaseModel):
31
- html: GenerationStrategy = GenerationStrategy.LLM
32
- llm: Optional[LlmConfig] = None
33
- markdown: GenerationStrategy = GenerationStrategy.LLM
34
- crop_image: CroppingStrategy = CroppingStrategy.ALL
35
-
36
- class SegmentProcessing(BaseModel):
37
- title: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
38
- section_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
39
- text: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
40
- list_item: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
41
- table: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
42
- picture: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
43
- caption: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
44
- formula: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
45
- footnote: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
46
- page_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
47
- page_footer: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
48
- page: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
49
-
50
- class ChunkProcessing(BaseModel):
51
- target_length: int = 512
52
-
53
- class Property(BaseModel):
54
- name: str
55
- title: Optional[str]
56
- prop_type: str
57
- description: Optional[str]
58
- default: Optional[str]
59
-
60
- class JsonSchema(BaseModel):
61
- title: str
62
- properties: List[Property]
63
- schema_type: Optional[str]
64
-
65
- class OcrStrategy(str, Enum):
66
- ALL = "All"
67
- AUTO = "Auto"
68
-
69
- class SegmentationStrategy(str, Enum):
70
- LAYOUT_ANALYSIS = "LayoutAnalysis"
71
- PAGE = "Page"
72
-
73
- class BoundingBox(BaseModel):
74
- left: float
75
- top: float
76
- width: float
77
- height: float
78
-
79
- class OCRResult(BaseModel):
80
- bbox: BoundingBox
81
- text: str
82
- confidence: Optional[float]
83
-
84
- class SegmentType(str, Enum):
85
- CAPTION = "Caption"
86
- FOOTNOTE = "Footnote"
87
- FORMULA = "Formula"
88
- LIST_ITEM = "ListItem"
89
- PAGE = "Page"
90
- PAGE_FOOTER = "PageFooter"
91
- PAGE_HEADER = "PageHeader"
92
- PICTURE = "Picture"
93
- SECTION_HEADER = "SectionHeader"
94
- TABLE = "Table"
95
- TEXT = "Text"
96
- TITLE = "Title"
97
-
98
- class Segment(BaseModel):
99
- bbox: BoundingBox
100
- content: str
101
- page_height: float
102
- html: Optional[str]
103
- image: Optional[str]
104
- markdown: Optional[str]
105
- ocr: List[OCRResult]
106
- page_number: int
107
- page_width: float
108
- segment_id: str
109
- segment_type: SegmentType
110
-
111
- class Chunk(BaseModel):
112
- chunk_id: str
113
- chunk_length: int
114
- segments: List[Segment]
115
-
116
- class ExtractedJson(BaseModel):
117
- data: Dict
118
-
119
- class OutputResponse(BaseModel):
120
- chunks: List[Chunk] = []
121
- extracted_json: Optional[ExtractedJson]
122
-
123
- class Model(str, Enum):
124
- FAST = "Fast"
125
- HIGH_QUALITY = "HighQuality"
126
-
127
- class Configuration(BaseModel):
128
- chunk_processing: ChunkProcessing = Field(default_factory=ChunkProcessing)
129
- expires_in: Optional[int] = None
130
- high_resolution: bool = False
131
- json_schema: Optional[JsonSchema] = None
132
- model: Optional[Model] = Field(None, deprecated=True)
133
- ocr_strategy: OcrStrategy = OcrStrategy.AUTO
134
- segment_processing: SegmentProcessing = Field(default_factory=SegmentProcessing)
135
- segmentation_strategy: SegmentationStrategy = SegmentationStrategy.LAYOUT_ANALYSIS
136
- target_chunk_length: Optional[int] = Field(None, deprecated=True)
137
-
138
-
139
- class Status(str, Enum):
140
- STARTING = "Starting"
141
- PROCESSING = "Processing"
142
- SUCCEEDED = "Succeeded"
143
- FAILED = "Failed"
144
-
145
- class TaskResponse(BaseModel, HeadersMixin):
146
- configuration: Configuration
147
- created_at: datetime
148
- expires_at: Optional[datetime]
149
- file_name: Optional[str]
150
- finished_at: Optional[datetime]
151
- input_file_url: Optional[str]
152
- message: str
153
- output: Optional[OutputResponse]
154
- page_count: Optional[int]
155
- pdf_url: Optional[str]
156
- status: Status
157
- task_id: str
158
- task_url: Optional[str]
159
- _api_key: Optional[str] = PrivateAttr(default=None)
160
-
161
- def with_api_key(self, api_key: str) -> 'TaskResponse':
162
- """Helper function to set api key on a TaskResponse after creation"""
163
- self._api_key = api_key
164
- return self
165
-
166
- def poll(self) -> 'TaskResponse':
167
- """Poll the task for completion"""
168
- if not self.task_url:
169
- raise ValueError("Task URL not found in response")
170
-
171
- while True:
172
- r = requests.get(self.task_url, headers=self._headers())
173
- r.raise_for_status()
174
- self.__dict__.update(r.json())
175
- if self.status == "Failed":
176
- raise ValueError(self.message)
177
- if self.status not in ("Starting", "Processing"):
178
- return self
179
- time.sleep(0.5)
180
-
181
- async def poll_async(self) -> 'TaskResponse':
182
- """Async poll the task for completion"""
183
- if not self.task_url:
184
- raise ValueError("Task URL not found in response")
185
-
186
- async with httpx.AsyncClient() as client:
187
- while True:
188
- r = await client.get(self.task_url, headers=self._headers())
189
- r.raise_for_status()
190
- self.__dict__.update(r.json())
191
- if self.status == "Failed":
192
- raise ValueError(self.message)
193
- if self.status not in ("Starting", "Processing"):
194
- return self
195
- await asyncio.sleep(0.5)
196
-
197
-
198
- def _get_content(self, content_type: str) -> str:
199
- """Helper method to get either HTML, Markdown, or raw content."""
200
- if not self.output:
201
- return ""
202
- parts = []
203
- for c in self.output.chunks:
204
- for s in c.segments:
205
- content = getattr(s, content_type)
206
- if content:
207
- parts.append(content)
208
- return "\n".join(parts)
209
-
210
- def html(self) -> str:
211
- """Get full HTML for the task"""
212
- return self._get_content("html")
213
-
214
- def markdown(self) -> str:
215
- """Get full markdown for the task"""
216
- return self._get_content("markdown")
217
-
218
- def content(self) -> str:
219
- """Get full text for the task"""
220
- return self._get_content("content")
221
-
222
- class TaskPayload(BaseModel):
223
- current_configuration: Configuration
224
- file_name: str
225
- image_folder_location: str
226
- input_location: str
227
- output_location: str
228
- pdf_location: str
229
- previous_configuration: Optional[Configuration]
230
- task_id: str
231
- user_id: str
@@ -1,16 +0,0 @@
1
- Metadata-Version: 2.2
2
- Name: chunkr-ai
3
- Version: 0.0.2
4
- Summary: Python client for chunkr: open source document intelligence
5
- Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
- Description-Content-Type: text/markdown
7
- License-File: LICENSE
8
- Requires-Dist: build>=1.2.2.post1
9
- Requires-Dist: httpx>=0.28.1
10
- Requires-Dist: pillow>=11.1.0
11
- Requires-Dist: pydantic>=2.10.4
12
- Requires-Dist: python-dotenv>=1.0.1
13
- Requires-Dist: requests>=2.32.3
14
- Requires-Dist: twine>=6.0.1
15
- Provides-Extra: test
16
- Requires-Dist: pytest>=8.3.4; extra == "test"
@@ -1,12 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
- chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- chunkr_ai/api/auth.py,sha256=U25WiNQBsrAWYAntuds0zSMvB4gUpAwGoSa5wnQ2LRQ,454
5
- chunkr_ai/api/chunkr.py,sha256=UqFoK8ytCsW1I5F0nM4OD6I4zigy-UHzGuMDtpvMSmE,4454
6
- chunkr_ai/api/chunkr_async.py,sha256=Kfh7_DEon6QTPe-XJops8l9R6rp0zIfJKeh9ZEGFQao,1529
7
- chunkr_ai/api/models.py,sha256=vAVeRHgdSO4SDl009R2Vz75WtuXAwkUZW8ZsVXk9yBA,7221
8
- chunkr_ai-0.0.2.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- chunkr_ai-0.0.2.dist-info/METADATA,sha256=ZK6gdzkukxMEVr1WxodLZ9dZNHar32C00ST1LG9mFl8,519
10
- chunkr_ai-0.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
11
- chunkr_ai-0.0.2.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
12
- chunkr_ai-0.0.2.dist-info/RECORD,,