docslight 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docslight/__init__.py +41 -0
- docslight/cli.py +215 -0
- docslight/client.py +92 -0
- docslight/cloud/__init__.py +5 -0
- docslight/cloud/client.py +626 -0
- docslight/config.py +117 -0
- docslight/exceptions.py +65 -0
- docslight/local/__init__.py +31 -0
- docslight/local/layout_blocks.py +80 -0
- docslight/local/llm_extractor.py +252 -0
- docslight/local/loaders.py +95 -0
- docslight/local/markdown.py +18 -0
- docslight/local/office_loader.py +128 -0
- docslight/local/paddle_parser.py +173 -0
- docslight/local/pipeline.py +213 -0
- docslight/preview.py +46 -0
- docslight/providers/__init__.py +6 -0
- docslight/providers/ollama.py +30 -0
- docslight/providers/openai_compatible.py +64 -0
- docslight/result.py +89 -0
- docslight/schemas/__init__.py +5 -0
- docslight/schemas/fields.py +190 -0
- docslight/standard_json.py +367 -0
- docslight/static/app/common.js +668 -0
- docslight/static/app/docslight-extract.json +307 -0
- docslight/static/app/extract.js +394 -0
- docslight/static/app/i18n.js +405 -0
- docslight/static/app/parse.js +161 -0
- docslight/static/styles.css +878 -0
- docslight/templates/base.html +36 -0
- docslight/templates/extract.html +123 -0
- docslight/templates/parse.html +81 -0
- docslight/web_app.py +386 -0
- docslight-0.1.0.dist-info/METADATA +277 -0
- docslight-0.1.0.dist-info/RECORD +39 -0
- docslight-0.1.0.dist-info/WHEEL +5 -0
- docslight-0.1.0.dist-info/entry_points.txt +2 -0
- docslight-0.1.0.dist-info/licenses/LICENSE +21 -0
- docslight-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="utf-8" />
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
6
|
+
<title>{% block title %}DocSlight Workbench{% endblock %}</title>
|
|
7
|
+
<link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}?v=6" />
|
|
8
|
+
</head>
|
|
9
|
+
<body data-page="{{ active_page }}">
|
|
10
|
+
<header class="topbar">
|
|
11
|
+
<a class="brand" href="{{ url_for('parse_page') }}" aria-label="DocSlight home">
|
|
12
|
+
<span class="brand-mark">D</span>
|
|
13
|
+
<span class="brand-text">DocSlight</span>
|
|
14
|
+
</a>
|
|
15
|
+
<nav class="topnav" aria-label="Workbench pages">
|
|
16
|
+
<a class="nav-link{% if active_page == 'parse' %} is-active{% endif %}" href="{{ url_for('parse_page') }}" data-i18n="nav.parse">Parse</a>
|
|
17
|
+
<a class="nav-link{% if active_page == 'extract' %} is-active{% endif %}" href="{{ url_for('extract_page') }}" data-i18n="nav.extract">Extract</a>
|
|
18
|
+
</nav>
|
|
19
|
+
<div class="topbar-actions">
|
|
20
|
+
<select id="languageSelect" class="language-select" aria-label="Language" data-i18n-aria-label="language.label">
|
|
21
|
+
<option value="en">English</option>
|
|
22
|
+
<option value="zh-CN">简体中文</option>
|
|
23
|
+
<option value="zh-TW">繁體中文</option>
|
|
24
|
+
</select>
|
|
25
|
+
<div class="health-badge" aria-live="polite">
|
|
26
|
+
<span class="health-dot" aria-hidden="true"></span>
|
|
27
|
+
<span id="healthStatus" data-i18n="health.checking">Checking service...</span>
|
|
28
|
+
</div>
|
|
29
|
+
</div>
|
|
30
|
+
</header>
|
|
31
|
+
<main class="app-main">
|
|
32
|
+
{% block content %}{% endblock %}
|
|
33
|
+
</main>
|
|
34
|
+
{% block page_scripts %}{% endblock %}
|
|
35
|
+
</body>
|
|
36
|
+
</html>
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
{% extends "base.html" %}
|
|
2
|
+
|
|
3
|
+
{% block title %}Extract | DocSlight Workbench{% endblock %}
|
|
4
|
+
|
|
5
|
+
{% block content %}
|
|
6
|
+
<section class="workbench" aria-label="Extract workbench" data-i18n-aria-label="extract.workbench">
|
|
7
|
+
<form id="extractForm" class="panel config-panel" enctype="multipart/form-data">
|
|
8
|
+
<div class="panel-header">
|
|
9
|
+
<p class="eyebrow" data-i18n="extract.eyebrow">Extract setup</p>
|
|
10
|
+
<h1 data-i18n="extract.title">Extract fields</h1>
|
|
11
|
+
<p data-i18n="extract.description">Define fields and tables, then extract structured values from a document.</p>
|
|
12
|
+
</div>
|
|
13
|
+
|
|
14
|
+
<label id="dropZone" class="drop-zone compact-drop-zone" for="fileInput">
|
|
15
|
+
<input id="fileInput" name="file" type="file" accept=".pdf,.png,.jpg,.jpeg,.tif,.tiff,.bmp,.webp,.docx,.pptx,.xlsx" />
|
|
16
|
+
<span class="drop-title" data-i18n="drop.choose">Choose document</span>
|
|
17
|
+
<span class="drop-copy" data-i18n="drop.formats">PDF, image, DOCX, PPTX, XLSX</span>
|
|
18
|
+
<span id="fileName" class="file-name" data-i18n="drop.none">No file selected</span>
|
|
19
|
+
</label>
|
|
20
|
+
|
|
21
|
+
<label class="field-label" for="modeSelect" data-i18n="mode.label">Processing mode</label>
|
|
22
|
+
<select id="modeSelect" name="mode">
|
|
23
|
+
<option value="cloud" data-i18n="mode.cloud">Cloud</option>
|
|
24
|
+
<option value="local" data-i18n="mode.local">Local</option>
|
|
25
|
+
</select>
|
|
26
|
+
|
|
27
|
+
<div id="cloudConfig" class="config-block">
|
|
28
|
+
<label class="field-label">
|
|
29
|
+
<span data-i18n="cloud.baseUrl">Cloud Base URL</span>
|
|
30
|
+
<input name="base_url" type="url" placeholder="https://api.compdf.com" />
|
|
31
|
+
</label>
|
|
32
|
+
<label class="field-label">
|
|
33
|
+
<span data-i18n="cloud.apiKey">API key</span>
|
|
34
|
+
<input name="api_key" type="password" autocomplete="off" placeholder="Cloud API key" data-i18n-placeholder="cloud.apiKeyPlaceholder" />
|
|
35
|
+
</label>
|
|
36
|
+
<label class="field-label">
|
|
37
|
+
<span data-i18n="cloud.extractMode">Cloud model</span>
|
|
38
|
+
<select id="cloudExtractMode" name="cloud_extract_mode">
|
|
39
|
+
<option value="vlm">vlm</option>
|
|
40
|
+
<option value="integrate">integrate</option>
|
|
41
|
+
</select>
|
|
42
|
+
</label>
|
|
43
|
+
<label id="groundingToggle" class="checkbox-label" hidden>
|
|
44
|
+
<input id="enableGrounding" name="enable_grounding" type="checkbox" value="true" checked />
|
|
45
|
+
<span data-i18n="cloud.enableGrounding">Enable grounding</span>
|
|
46
|
+
</label>
|
|
47
|
+
</div>
|
|
48
|
+
|
|
49
|
+
<fieldset id="localLlmBlock" class="config-block">
|
|
50
|
+
<legend data-i18n="extract.localLlm">Local LLM</legend>
|
|
51
|
+
<label class="field-label">
|
|
52
|
+
<span data-i18n="extract.provider">Provider</span>
|
|
53
|
+
<select name="local_llm_provider">
|
|
54
|
+
<option value="ollama">ollama</option>
|
|
55
|
+
<option value="openai-compatible">openai-compatible</option>
|
|
56
|
+
<option value="openai">openai</option>
|
|
57
|
+
</select>
|
|
58
|
+
</label>
|
|
59
|
+
<label class="field-label">
|
|
60
|
+
<span data-i18n="extract.model">Model</span>
|
|
61
|
+
<input name="local_llm_model" type="text" placeholder="llama3.1" />
|
|
62
|
+
</label>
|
|
63
|
+
<label class="field-label">
|
|
64
|
+
<span data-i18n="extract.baseUrl">Base URL</span>
|
|
65
|
+
<input name="local_llm_base_url" type="url" placeholder="http://localhost:11434" />
|
|
66
|
+
</label>
|
|
67
|
+
<label class="field-label">
|
|
68
|
+
<span data-i18n="extract.apiKey">API key</span>
|
|
69
|
+
<input name="local_llm_api_key" type="password" autocomplete="off" placeholder="optional" data-i18n-placeholder="extract.optionalPlaceholder" />
|
|
70
|
+
</label>
|
|
71
|
+
</fieldset>
|
|
72
|
+
|
|
73
|
+
<section id="fieldsBuilder" class="fields-builder" aria-labelledby="fieldsBuilderTitle">
|
|
74
|
+
<h2 id="fieldsBuilderTitle" data-i18n="fields.title">Fields</h2>
|
|
75
|
+
<label class="field-label" for="fieldTemplateName" data-i18n="fields.templateName">Template name</label>
|
|
76
|
+
<input id="fieldTemplateName" type="text" placeholder="Invoice" data-i18n-placeholder="fields.templatePlaceholder" />
|
|
77
|
+
<div id="fieldsRows" class="fields-rows"></div>
|
|
78
|
+
<div class="field-actions">
|
|
79
|
+
<button id="addFieldButton" type="button" data-i18n="fields.addField">Add field</button>
|
|
80
|
+
<button id="addTableButton" type="button" data-i18n="fields.addTable">Add table</button>
|
|
81
|
+
</div>
|
|
82
|
+
<input name="fields" type="hidden" />
|
|
83
|
+
</section>
|
|
84
|
+
|
|
85
|
+
<p id="formError" class="form-error" role="alert" hidden></p>
|
|
86
|
+
<button id="submitButton" type="submit" data-i18n="extract.run">Run extract</button>
|
|
87
|
+
</form>
|
|
88
|
+
|
|
89
|
+
<section class="panel specimen-panel" aria-label="Document specimen" data-i18n-aria-label="preview.specimen">
|
|
90
|
+
<div class="preview-header">
|
|
91
|
+
<h2 id="previewTitle" data-i18n="preview.title">Document preview</h2>
|
|
92
|
+
<span id="highlightStatus" data-i18n="preview.noHighlight">No highlight selected</span>
|
|
93
|
+
</div>
|
|
94
|
+
<div id="previewCanvas" class="preview-canvas"></div>
|
|
95
|
+
<p id="officePreviewNotice" class="preview-notice" hidden data-i18n="preview.officeUnsupported">Office files can be processed, but preview and positioning highlight are not supported in this version.</p>
|
|
96
|
+
</section>
|
|
97
|
+
|
|
98
|
+
<section class="panel result-panel" aria-labelledby="extractResultsTitle">
|
|
99
|
+
<div class="result-header">
|
|
100
|
+
<h2 id="extractResultsTitle" data-i18n="extract.resultsTitle">Extract results</h2>
|
|
101
|
+
<button id="downloadButton" type="button" disabled data-i18n="common.download">Download</button>
|
|
102
|
+
</div>
|
|
103
|
+
<div id="extractResultTabs" class="result-tabs" role="tablist">
|
|
104
|
+
<button type="button" data-result-tab="fields" data-i18n="extract.tabs.fields">Fields</button>
|
|
105
|
+
<button type="button" data-result-tab="json" data-i18n="extract.tabs.json">JSON</button>
|
|
106
|
+
</div>
|
|
107
|
+
<section id="fieldsPanel" data-result-panel="fields">
|
|
108
|
+
<pre id="fieldsResult"></pre>
|
|
109
|
+
</section>
|
|
110
|
+
<section id="jsonPanel" data-result-panel="json">
|
|
111
|
+
<pre id="jsonResult"></pre>
|
|
112
|
+
</section>
|
|
113
|
+
<details class="metadata-panel">
|
|
114
|
+
<summary data-i18n="common.metadataPreview">Metadata preview</summary>
|
|
115
|
+
<pre id="metadataPreview"></pre>
|
|
116
|
+
</details>
|
|
117
|
+
</section>
|
|
118
|
+
</section>
|
|
119
|
+
{% endblock %}
|
|
120
|
+
|
|
121
|
+
{% block page_scripts %}
|
|
122
|
+
<script type="module" src="{{ url_for('static', filename='app/extract.js') }}"></script>
|
|
123
|
+
{% endblock %}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
{% extends "base.html" %}
|
|
2
|
+
|
|
3
|
+
{% block title %}Parse | DocSlight Workbench{% endblock %}
|
|
4
|
+
|
|
5
|
+
{% block content %}
|
|
6
|
+
<section class="workbench" aria-label="Parse workbench" data-i18n-aria-label="parse.workbench">
|
|
7
|
+
<form id="parseForm" class="panel config-panel" enctype="multipart/form-data">
|
|
8
|
+
<div class="panel-header">
|
|
9
|
+
<p class="eyebrow" data-i18n="parse.eyebrow">Parse setup</p>
|
|
10
|
+
<h1 data-i18n="parse.title">Parse documents</h1>
|
|
11
|
+
<p data-i18n="parse.description">Convert documents into layout blocks, Markdown, and raw JSON.</p>
|
|
12
|
+
</div>
|
|
13
|
+
|
|
14
|
+
<label id="dropZone" class="drop-zone compact-drop-zone" for="fileInput">
|
|
15
|
+
<input id="fileInput" name="file" type="file" accept=".pdf,.png,.jpg,.jpeg,.tif,.tiff,.bmp,.webp,.docx,.pptx,.xlsx" />
|
|
16
|
+
<span class="drop-title" data-i18n="drop.choose">Choose document</span>
|
|
17
|
+
<span class="drop-copy" data-i18n="drop.formats">PDF, image, DOCX, PPTX, XLSX</span>
|
|
18
|
+
<span id="fileName" class="file-name" data-i18n="drop.none">No file selected</span>
|
|
19
|
+
</label>
|
|
20
|
+
|
|
21
|
+
<label class="field-label" for="modeSelect" data-i18n="mode.label">Processing mode</label>
|
|
22
|
+
<select id="modeSelect" name="mode">
|
|
23
|
+
<option value="cloud" data-i18n="mode.cloud">Cloud</option>
|
|
24
|
+
<option value="local" data-i18n="mode.local">Local</option>
|
|
25
|
+
</select>
|
|
26
|
+
|
|
27
|
+
<div id="cloudConfig" class="config-block">
|
|
28
|
+
<label class="field-label">
|
|
29
|
+
<span data-i18n="cloud.baseUrl">Cloud Base URL</span>
|
|
30
|
+
<input name="base_url" type="url" placeholder="https://api.compdf.com" />
|
|
31
|
+
</label>
|
|
32
|
+
<label class="field-label">
|
|
33
|
+
<span data-i18n="cloud.apiKey">API key</span>
|
|
34
|
+
<input name="api_key" type="password" autocomplete="off" placeholder="Cloud API key" data-i18n-placeholder="cloud.apiKeyPlaceholder" />
|
|
35
|
+
</label>
|
|
36
|
+
</div>
|
|
37
|
+
|
|
38
|
+
<p id="localParseNote" class="helper-text" data-i18n="parse.localNote">Local parsing uses the configured local runtime.</p>
|
|
39
|
+
<p id="formError" class="form-error" role="alert" hidden></p>
|
|
40
|
+
<button id="submitButton" type="submit" data-i18n="parse.run">Run parse</button>
|
|
41
|
+
</form>
|
|
42
|
+
|
|
43
|
+
<section class="panel specimen-panel" aria-label="Document specimen" data-i18n-aria-label="preview.specimen">
|
|
44
|
+
<div class="preview-header">
|
|
45
|
+
<h2 id="previewTitle" data-i18n="preview.title">Document preview</h2>
|
|
46
|
+
<span id="highlightStatus" data-i18n="preview.noHighlight">No highlight selected</span>
|
|
47
|
+
</div>
|
|
48
|
+
<div id="previewCanvas" class="preview-canvas"></div>
|
|
49
|
+
<p id="officePreviewNotice" class="preview-notice" hidden data-i18n="preview.officeUnsupported">Office files can be processed, but preview and positioning highlight are not supported in this version.</p>
|
|
50
|
+
</section>
|
|
51
|
+
|
|
52
|
+
<section class="panel result-panel" aria-labelledby="parseResultsTitle">
|
|
53
|
+
<div class="result-header">
|
|
54
|
+
<h2 id="parseResultsTitle" data-i18n="parse.resultsTitle">Parse results</h2>
|
|
55
|
+
<button id="downloadButton" type="button" disabled data-i18n="common.download">Download</button>
|
|
56
|
+
</div>
|
|
57
|
+
<div id="parseResultTabs" class="result-tabs" role="tablist">
|
|
58
|
+
<button type="button" data-result-tab="blocks" data-i18n="parse.tabs.blocks">Blocks</button>
|
|
59
|
+
<button type="button" data-result-tab="markdown" data-i18n="parse.tabs.markdown">Markdown</button>
|
|
60
|
+
<button type="button" data-result-tab="json" data-i18n="parse.tabs.json">JSON</button>
|
|
61
|
+
</div>
|
|
62
|
+
<section id="blocksPanel" data-result-panel="blocks">
|
|
63
|
+
<pre id="blocksResult"></pre>
|
|
64
|
+
</section>
|
|
65
|
+
<section id="markdownPanel" data-result-panel="markdown">
|
|
66
|
+
<pre id="markdownResult"></pre>
|
|
67
|
+
</section>
|
|
68
|
+
<section id="jsonPanel" data-result-panel="json">
|
|
69
|
+
<pre id="jsonResult"></pre>
|
|
70
|
+
</section>
|
|
71
|
+
<details class="metadata-panel">
|
|
72
|
+
<summary data-i18n="common.metadataPreview">Metadata preview</summary>
|
|
73
|
+
<pre id="metadataPreview"></pre>
|
|
74
|
+
</details>
|
|
75
|
+
</section>
|
|
76
|
+
</section>
|
|
77
|
+
{% endblock %}
|
|
78
|
+
|
|
79
|
+
{% block page_scripts %}
|
|
80
|
+
<script type="module" src="{{ url_for('static', filename='app/parse.js') }}"></script>
|
|
81
|
+
{% endblock %}
|
docslight/web_app.py
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
"""Local Flask web application for DocSlight."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import base64
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
import sys
|
|
10
|
+
import tempfile
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
from io import BytesIO
|
|
13
|
+
from json import JSONDecodeError
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, cast
|
|
16
|
+
|
|
17
|
+
from flask import Flask, Response, jsonify, redirect, render_template, request, send_file, url_for
|
|
18
|
+
from werkzeug.datastructures import FileStorage
|
|
19
|
+
from werkzeug.utils import secure_filename
|
|
20
|
+
|
|
21
|
+
from docslight import DocSlight
|
|
22
|
+
from docslight.exceptions import (
|
|
23
|
+
AuthenticationError,
|
|
24
|
+
CloudAPIError,
|
|
25
|
+
ConfigurationError,
|
|
26
|
+
DocSlightError,
|
|
27
|
+
RateLimitError,
|
|
28
|
+
)
|
|
29
|
+
from docslight.preview import render_pdf_preview
|
|
30
|
+
from docslight.schemas import build_extract_schema, normalize_fields
|
|
31
|
+
|
|
32
|
+
# Image formats the preview code can inline as a data URL, keyed by bare
# lowercase extension.
IMAGE_MIME_TYPES = {
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "tif": "image/tiff",
    "tiff": "image/tiff",
    "bmp": "image/bmp",
    "webp": "image/webp",
}
# Office formats: accepted for upload, but the preview reports them as
# unsupported.
OFFICE_EXTENSIONS = {"docx", "pptx", "xlsx"}
# Every bare, lowercase extension accepted for upload: PDF plus the image and
# Office formats above.
ALLOWED_EXTENSIONS = {"pdf", *IMAGE_MIME_TYPES, *OFFICE_EXTENSIONS}
OFFICE_PREVIEW_UNSUPPORTED_MESSAGE = (
    "Office files can be processed, but preview and positioning highlight are not supported in this version."
)
LOG_FORMAT = "%(levelname)s:%(name)s:%(message)s"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def create_app(docslight_factory: Callable[..., Any] = DocSlight) -> Flask:
    """Build the Flask app that serves the workbench pages and the JSON API.

    Args:
        docslight_factory: Callable invoked once per parse/extract request
            with the keyword arguments ``mode``, ``api_key``, ``base_url``
            and ``local_llm`` taken from the submitted form. Its return value
            must provide ``parse(path)`` / ``extract(path, **options)``.

    Returns:
        The configured ``Flask`` application.
    """
    app = Flask(__name__)

    def _client_kwargs(include_local_llm: bool = True) -> dict[str, Any]:
        # Blank form inputs are passed to the factory as None.
        form = request.form
        return {
            "mode": _blank_to_none(form.get("mode")),
            "api_key": _blank_to_none(form.get("api_key")),
            "base_url": _blank_to_none(form.get("base_url")),
            "local_llm": local_llm_from_form(form) if include_local_llm else None,
        }

    @app.get("/")
    def index() -> Any:
        # The parse page doubles as the landing page.
        return redirect(url_for("parse_page"))

    @app.get("/parse")
    def parse_page() -> str:
        return render_template("parse.html", active_page="parse")

    @app.get("/extract")
    def extract_page() -> str:
        return render_template("extract.html", active_page="extract")

    @app.get("/api/health")
    def health() -> Any:
        return jsonify({"status": "healthy", "service": "docslight-web"})

    @app.get("/api/system-info")
    def system_info() -> Any:
        info = {
            "modes": ["cloud", "local"],
            "supported_extensions": sorted(ALLOWED_EXTENSIONS),
        }
        return jsonify(info)

    @app.post("/api/parse")
    def parse_document() -> Any:
        upload = _require_upload()
        if not isinstance(upload, FileStorage):
            # Validation failed; ``upload`` is already the error response.
            return upload

        def operation(path: Path) -> Any:
            # Parsing never uses the local LLM settings.
            client = docslight_factory(**_client_kwargs(include_local_llm=False))
            return _parse_response_payload(client.parse(path))

        return _with_temp_upload(upload, operation)

    @app.post("/api/extract")
    def extract_document() -> Any:
        upload = _require_upload()
        if not isinstance(upload, FileStorage):
            # Validation failed; ``upload`` is already the error response.
            return upload

        def operation(path: Path) -> dict[str, Any]:
            options: dict[str, Any] = {}

            field_spec = _parse_fields_form_field()
            if field_spec is not None:
                options["fields"] = field_spec
                derived = build_extract_schema(field_spec)
                if derived is not None:
                    options["schema"] = derived

            # An explicitly submitted schema replaces the one derived from fields.
            explicit_schema = _parse_json_form_field("schema")
            if explicit_schema is not None:
                options["schema"] = explicit_schema

            document_types = _parse_json_form_field("document_types")
            if document_types is not None:
                if not isinstance(document_types, list):
                    raise ValueError("document_types must be a JSON list")
                options["document_types"] = document_types

            is_local = _blank_to_none(request.form.get("mode")) == "local"
            if not is_local:
                cloud_mode = _blank_to_none(request.form.get("cloud_extract_mode")) or "vlm"
                options["mode"] = cloud_mode
                grounding = _parse_bool_form_field("enable_grounding")
                # Grounding is only forwarded for the "integrate" cloud mode.
                if grounding is not None and cloud_mode == "integrate":
                    options["enable_grounding"] = grounding

            client = docslight_factory(**_client_kwargs())
            extracted = client.extract(path, **options)
            return cast(dict[str, Any], extracted.to_json())

        return _with_temp_upload(upload, operation, wrap_result=False)

    @app.post("/api/preview")
    def preview_document() -> Any:
        upload = _require_upload()
        if not isinstance(upload, FileStorage):
            # Validation failed; ``upload`` is already the error response.
            return upload
        return _with_temp_upload(upload, _preview_payload)

    return app
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def run_web_app(host: str = "127.0.0.1", port: int = 8000, debug: bool = False) -> None:
    """Configure logging, build the app, and serve it on ``host``/``port``.

    ``debug`` is passed both to the logging setup and to Flask's ``run``.
    """
    _configure_web_logging(debug)
    application = create_app()
    application.run(host=host, port=port, debug=debug)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _configure_web_logging(debug: bool) -> None:
    """Raise logging to INFO when ``debug`` is true; do nothing otherwise.

    If the root logger already has handlers, only its level is changed, so an
    existing logging setup is not replaced. Otherwise ``basicConfig`` installs
    a handler using ``LOG_FORMAT``.
    """
    if debug:
        root = logging.getLogger()
        if root.handlers:
            root.setLevel(logging.INFO)
        else:
            logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
        logging.getLogger("docslight").setLevel(logging.INFO)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def build_parser() -> argparse.ArgumentParser:
    """Return the argument parser for ``python -m docslight.web_app``.

    Options: ``--host`` (default ``127.0.0.1``), ``--port`` (int, default
    ``8000``) and the ``--debug`` flag.
    """
    parser = argparse.ArgumentParser(
        description="Run the DocSlight web application.",
        prog="python -m docslight.web_app",
    )
    option_table = (
        ("--host", {"default": "127.0.0.1"}),
        ("--port", {"type": int, "default": 8000}),
        ("--debug", {"action": "store_true"}),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    return parser
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the web application.

    Returns ``0`` once ``run_web_app`` returns.
    """
    parser = build_parser()
    options = parser.parse_args(argv)
    run_web_app(options.host, options.port, options.debug)
    return 0
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def local_llm_from_form(form: Any) -> dict[str, str] | None:
    """Collect the ``local_llm_*`` form values into a settings dict.

    Reads ``local_llm_provider``, ``local_llm_model``, ``local_llm_base_url``
    and ``local_llm_api_key``. Blank values are dropped. Returns ``None`` when
    all four are blank; otherwise ``provider`` defaults to ``"ollama"``.
    """
    names = ("provider", "model", "base_url", "api_key")
    submitted = {name: _blank_to_none(form.get(f"local_llm_{name}")) for name in names}
    filled = {name: text for name, text in submitted.items() if text is not None}
    if not filled:
        return None
    # Listing "provider" first keeps it as the leading key; a submitted
    # provider overrides the default value.
    return {"provider": "ollama", **filled}
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _parse_response_payload(result: Any) -> Any:
    """Choose what the parse endpoint returns for ``result``.

    - Non-empty ``bytes`` in ``result.raw_archive``: a ZIP attachment response.
      The download name comes from ``metadata["downFileName"]`` or
      ``metadata["taskId"]`` when ``result.metadata`` is a dict, otherwise
      ``docslight-parse.zip``; ``.zip`` is appended if missing.
    - Otherwise ``result.raw_response`` when it is a dict.
    - Otherwise ``result.to_json()``.
    """
    archive = getattr(result, "raw_archive", None)
    if not (isinstance(archive, bytes) and archive):
        raw = getattr(result, "raw_response", None)
        if isinstance(raw, dict):
            return raw
        return cast(dict[str, Any], result.to_json())

    download_name = "docslight-parse.zip"
    metadata = getattr(result, "metadata", {})
    if isinstance(metadata, dict):
        download_name = str(metadata.get("downFileName") or metadata.get("taskId") or download_name)
    if not download_name.endswith(".zip"):
        download_name = f"{download_name}.zip"
    return send_file(
        BytesIO(archive),
        mimetype="application/zip",
        as_attachment=True,
        download_name=download_name,
    )
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _require_upload() -> FileStorage | Any:
    """Return the uploaded ``file`` part, or a 400 error response.

    The upload is rejected when it is missing, has no filename, or has a
    filename that ``_is_allowed_filename`` does not accept. Callers tell the
    two outcomes apart with ``isinstance(..., FileStorage)``.
    """
    upload = request.files.get("file")
    filename = upload.filename if upload is not None else None
    if not filename:
        return _error_response("A file upload is required.", 400)
    if _is_allowed_filename(filename):
        return upload
    return _error_response("Unsupported file extension.", 400)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _with_temp_upload(
    upload: FileStorage,
    operation: Callable[[Path], Any],
    wrap_result: bool = True,
) -> Any:
    """Save ``upload`` to a temporary file, run ``operation`` on it, and reply.

    Args:
        upload: The uploaded file to save to disk.
        operation: Called with the temporary file's path.
        wrap_result: When true the result is returned under a ``"result"``
            key; when false the result mapping is merged into the top level
            of the JSON body next to ``"success"``.

    Returns:
        ``operation``'s own ``Response`` if it returned one, otherwise a JSON
        success body. Any exception is converted by ``_exception_response``.
        The temporary file is always removed.
    """
    spooled: Path | None = None
    try:
        # delete=False: the file must outlive the ``with`` block so that
        # ``operation`` can reopen it by path; ``finally`` removes it.
        with tempfile.NamedTemporaryFile(
            delete=False,
            suffix=_safe_upload_suffix(upload.filename),
        ) as handle:
            spooled = Path(handle.name)
            upload.save(handle)

        outcome = operation(spooled)
        if isinstance(outcome, Response):
            return outcome
        body = {"success": True, "result": outcome} if wrap_result else {"success": True, **outcome}
        return jsonify(body)
    except Exception as exc:  # noqa: B902
        return _exception_response(exc)
    finally:
        if spooled is not None:
            spooled.unlink(missing_ok=True)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _safe_upload_suffix(filename: str | None) -> str:
    """Return the lowercase suffix to use for the temporary copy of an upload.

    ``secure_filename`` strips non-ASCII characters, so a name such as
    ``"截图.png"`` collapses to ``"png"`` and loses its ``.png`` extension.
    The suffix is therefore taken from the original filename first. The
    ``secure_filename`` fallback is used only when that suffix is empty or
    not in ``ALLOWED_EXTENSIONS``.

    Returns ``""`` for a missing or empty filename.
    """
    if not filename:
        return ""
    direct = Path(filename).suffix.lower()
    bare = direct.lstrip(".")
    if bare and bare in ALLOWED_EXTENSIONS:
        return direct
    sanitized = secure_filename(filename) or "upload"
    return Path(sanitized).suffix.lower()
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _parse_json_form_field(name: str) -> Any:
    """Decode the form field ``name`` as JSON.

    Returns ``None`` when the field is absent or blank. Invalid JSON raises
    the decoder's error from ``json.loads``.
    """
    raw = _blank_to_none(request.form.get(name))
    return None if raw is None else json.loads(raw)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _parse_fields_form_field() -> Any:
    """Read the ``fields`` form value and pass it through ``normalize_fields``.

    Returns ``None`` when the field is absent or blank. A value starting with
    ``{`` is decoded as JSON first; any other value is passed on as the raw
    string.
    """
    raw = _blank_to_none(request.form.get("fields"))
    if raw is None:
        return None
    text = raw.strip()
    spec = json.loads(text) if text.startswith("{") else text
    return normalize_fields(spec)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _parse_bool_form_field(name: str) -> bool | None:
    """Interpret the form field ``name`` as a boolean.

    Returns ``None`` when the field is absent or blank. Matching is
    case-insensitive: ``1/true/yes/on`` give ``True`` and ``0/false/no/off``
    give ``False``.

    Raises:
        ValueError: For any other non-blank value.
    """
    raw = _blank_to_none(request.form.get(name))
    if raw is None:
        return None
    recognised = {
        "1": True,
        "true": True,
        "yes": True,
        "on": True,
        "0": False,
        "false": False,
        "no": False,
        "off": False,
    }
    parsed = recognised.get(raw.strip().lower())
    if parsed is None:
        raise ValueError(f"{name} must be a boolean value")
    return parsed
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _preview_payload(path: Path) -> dict[str, Any]:
    """Build the preview payload for the file at ``path``, chosen by suffix.

    - ``pdf``: whatever ``render_pdf_preview`` returns.
    - Image suffixes: the file inlined as a base64 ``data:`` URL, with
      ``width``/``height`` added when both could be determined.
    - Office suffixes: an ``"unsupported"`` payload with an explanatory message.

    Raises:
        ValueError: For any other suffix.
    """
    extension = path.suffix.lower().lstrip(".")
    if extension == "pdf":
        return render_pdf_preview(path)
    if extension in OFFICE_EXTENSIONS:
        return {"kind": "unsupported", "message": OFFICE_PREVIEW_UNSUPPORTED_MESSAGE}
    if extension not in IMAGE_MIME_TYPES:
        raise ValueError("Unsupported file preview extension.")

    mime_type = IMAGE_MIME_TYPES[extension]
    encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    width, height = _probe_image_size(path)
    payload: dict[str, Any] = {
        "kind": "image",
        "mime_type": mime_type,
        "data_url": f"data:{mime_type};base64,{encoded}",
    }
    # Dimensions are optional; they are sent only when both are known.
    if width is not None and height is not None:
        payload.update(width=width, height=height)
    return payload
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def _probe_image_size(path: Path) -> tuple[int | None, int | None]:
    """Return ``(width, height)`` of the image at ``path``, best effort.

    Returns ``(None, None)`` when Pillow cannot be imported or the file cannot
    be opened as an image, so the front end can fall back to
    ``<img>.naturalWidth``/``naturalHeight``.
    """
    unknown: tuple[int | None, int | None] = (None, None)
    try:
        from PIL import Image
    except ImportError:
        return unknown
    try:
        with Image.open(path) as picture:
            width, height = picture.width, picture.height
            return int(width), int(height)
    except Exception:  # noqa: BLE001
        # Deliberately broad: probing must never fail the preview request.
        return unknown
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _is_allowed_filename(filename: str) -> bool:
    """Return whether ``filename``'s suffix, lowercased, is in ``ALLOWED_EXTENSIONS``."""
    extension = Path(filename).suffix.lower()
    return extension.lstrip(".") in ALLOWED_EXTENSIONS
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _blank_to_none(value: str | None) -> str | None:
    """Strip ``value``; return ``None`` if it is ``None`` or whitespace-only."""
    trimmed = value.strip() if value is not None else ""
    return trimmed if trimmed else None
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def _exception_response(error: Exception) -> Any:
    """Convert an exception raised while handling a request into a JSON error.

    Status mapping, checked in this order:

    - ``AuthenticationError`` -> 401
    - ``RateLimitError`` -> 429
    - ``CloudAPIError`` with a non-``None`` ``status_code`` -> that status code
    - ``ConfigurationError``, ``ValueError``, ``JSONDecodeError`` or any other
      ``DocSlightError`` -> 400
    - anything else -> 500 with a generic message

    For every status except 500 the response carries ``str(error)``.

    Args:
        error: The exception to convert.

    Returns:
        The value produced by ``_error_response``.
    """
    if isinstance(error, AuthenticationError):
        return _error_response(str(error), 401)
    if isinstance(error, RateLimitError):
        return _error_response(str(error), 429)
    if isinstance(error, CloudAPIError) and error.status_code is not None:
        return _error_response(str(error), error.status_code)
    if isinstance(error, (ConfigurationError, ValueError, JSONDecodeError, DocSlightError)):
        return _error_response(str(error), 400)
    # The client is only told "Internal server error.", so without this log
    # entry the real exception and its traceback would be lost.
    logging.getLogger(__name__).error(
        "Unhandled error while processing request",
        exc_info=error,
    )
    return _error_response("Internal server error.", 500)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _error_response(message: str, status_code: int) -> Any:
    """Return a ``(JSON body, status_code)`` pair describing a failed request.

    The body has ``success`` set to ``False`` and ``error`` set to ``message``.
    """
    body = jsonify({"success": False, "error": message})
    return body, status_code
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
if __name__ == "__main__":
|
|
385
|
+
sys.exit(main())
|
|
386
|
+
|