chunkr-ai 0.0.37__tar.gz → 0.0.39__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chunkr_ai-0.0.37/src/chunkr_ai.egg-info → chunkr_ai-0.0.39}/PKG-INFO +1 -1
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/pyproject.toml +1 -1
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/api/configuration.py +11 -11
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/api/task_response.py +3 -3
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/tests/test_chunkr.py +12 -2
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/LICENSE +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/README.md +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/setup.cfg +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/__init__.py +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/api/__init__.py +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/api/auth.py +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/api/chunkr.py +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/api/chunkr_base.py +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/api/decorators.py +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/api/misc.py +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/api/protocol.py +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai/models.py +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai.egg-info/SOURCES.txt +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai.egg-info/requires.txt +0 -0
- {chunkr_ai-0.0.37 → chunkr_ai-0.0.39}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "chunkr-ai"
|
7
|
-
version = "0.0.37"
|
7
|
+
version = "0.0.39"
|
8
8
|
authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
|
9
9
|
description = "Python client for Chunkr: open source document intelligence"
|
10
10
|
readme = "README.md"
|
@@ -19,20 +19,18 @@ class GenerationConfig(BaseModel):
|
|
19
19
|
class SegmentProcessing(BaseModel):
|
20
20
|
model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
|
21
21
|
|
22
|
-
title: Optional[GenerationConfig] = Field(default=None, alias="Title")
|
23
|
-
section_header: Optional[GenerationConfig] = Field(
|
24
|
-
default=None, alias="SectionHeader"
|
25
|
-
)
|
26
|
-
text: Optional[GenerationConfig] = Field(default=None, alias="Text")
|
27
|
-
list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
|
28
|
-
table: Optional[GenerationConfig] = Field(default=None, alias="Table")
|
29
|
-
picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
|
30
22
|
caption: Optional[GenerationConfig] = Field(default=None, alias="Caption")
|
31
|
-
formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
|
32
23
|
footnote: Optional[GenerationConfig] = Field(default=None, alias="Footnote")
|
33
|
-
|
34
|
-
|
24
|
+
formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
|
25
|
+
list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
|
35
26
|
page: Optional[GenerationConfig] = Field(default=None, alias="Page")
|
27
|
+
page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
|
28
|
+
page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
|
29
|
+
picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
|
30
|
+
section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
|
31
|
+
table: Optional[GenerationConfig] = Field(default=None, alias="Table")
|
32
|
+
text: Optional[GenerationConfig] = Field(default=None, alias="Text")
|
33
|
+
title: Optional[GenerationConfig] = Field(default=None, alias="Title")
|
36
34
|
|
37
35
|
class ChunkProcessing(BaseModel):
|
38
36
|
ignore_headers_and_footers: Optional[bool] = None
|
@@ -84,11 +82,13 @@ class Segment(BaseModel):
|
|
84
82
|
page_width: float
|
85
83
|
segment_id: str
|
86
84
|
segment_type: SegmentType
|
85
|
+
confidence: Optional[float]
|
87
86
|
|
88
87
|
class Chunk(BaseModel):
|
89
88
|
chunk_id: str
|
90
89
|
chunk_length: int
|
91
90
|
segments: List[Segment]
|
91
|
+
embed: Optional[str] = None
|
92
92
|
|
93
93
|
class OutputResponse(BaseModel):
|
94
94
|
chunks: List[Chunk]
|
@@ -144,7 +144,7 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
144
144
|
Args:
|
145
145
|
output_file (str, optional): Path to save the markdown content. Defaults to None.
|
146
146
|
"""
|
147
|
-
content = self._get_content("markdown")
|
147
|
+
content = self._get_content("markdown", separator="\n\n")
|
148
148
|
self._write_to_file(content, output_file)
|
149
149
|
return content
|
150
150
|
|
@@ -168,7 +168,7 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
168
168
|
self._write_to_file(data, output_file, is_json=True)
|
169
169
|
return data
|
170
170
|
|
171
|
-
def _get_content(self, t: str) -> str:
|
171
|
+
def _get_content(self, t: str, separator: str = "\n") -> str:
|
172
172
|
if not self.output:
|
173
173
|
return ""
|
174
174
|
parts = []
|
@@ -177,4 +177,4 @@ class TaskResponse(BaseModel, Generic[T]):
|
|
177
177
|
v = getattr(s, t)
|
178
178
|
if v:
|
179
179
|
parts.append(v)
|
180
|
-
return "\n".join(parts)
|
180
|
+
return separator.join(parts)
|
@@ -247,13 +247,18 @@ async def test_send_base64_file_with_filename(client, sample_path):
|
|
247
247
|
|
248
248
|
@pytest.mark.asyncio
|
249
249
|
async def test_output_files_no_dir(client, sample_path, tmp_path):
|
250
|
-
await client.upload(sample_path)
|
250
|
+
task = await client.upload(sample_path)
|
251
251
|
|
252
252
|
html_file = tmp_path / "output.html"
|
253
253
|
md_file = tmp_path / "output.md"
|
254
254
|
content_file = tmp_path / "output.txt"
|
255
255
|
json_file = tmp_path / "output.json"
|
256
256
|
|
257
|
+
task.html(html_file)
|
258
|
+
task.markdown(md_file)
|
259
|
+
task.content(content_file)
|
260
|
+
task.json(json_file)
|
261
|
+
|
257
262
|
assert html_file.exists()
|
258
263
|
assert md_file.exists()
|
259
264
|
assert content_file.exists()
|
@@ -261,13 +266,18 @@ async def test_output_files_no_dir(client, sample_path, tmp_path):
|
|
261
266
|
|
262
267
|
@pytest.mark.asyncio
|
263
268
|
async def test_output_files_with_dirs(client, sample_path, tmp_path):
|
264
|
-
await client.upload(sample_path)
|
269
|
+
task = await client.upload(sample_path)
|
265
270
|
|
266
271
|
nested_dir = tmp_path / "nested" / "output" / "dir"
|
267
272
|
html_file = nested_dir / "output.html"
|
268
273
|
md_file = nested_dir / "output.md"
|
269
274
|
content_file = nested_dir / "output.txt"
|
270
275
|
json_file = nested_dir / "output.json"
|
276
|
+
|
277
|
+
task.html(html_file)
|
278
|
+
task.markdown(md_file)
|
279
|
+
task.content(content_file)
|
280
|
+
task.json(json_file)
|
271
281
|
|
272
282
|
assert html_file.exists()
|
273
283
|
assert md_file.exists()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|