@exulu/backend 1.46.1 → 1.47.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/mintlify/SKILL.md +347 -0
- package/.editorconfig +15 -0
- package/.eslintrc.json +52 -0
- package/.jscpd.json +18 -0
- package/.prettierignore +5 -0
- package/.prettierrc.json +12 -0
- package/CHANGELOG.md +15 -2
- package/README.md +747 -0
- package/SECURITY.md +5 -0
- package/dist/index.cjs +12015 -10496
- package/dist/index.d.cts +725 -667
- package/dist/index.d.ts +725 -667
- package/dist/index.js +12034 -10508
- package/ee/LICENSE.md +62 -0
- package/ee/agentic-retrieval/index.ts +1109 -0
- package/ee/documents/THIRD_PARTY_LICENSES/docling.txt +31 -0
- package/ee/documents/processing/build_pdf_processor.sh +35 -0
- package/ee/documents/processing/chunk_markdown.py +263 -0
- package/ee/documents/processing/doc_processor.ts +635 -0
- package/ee/documents/processing/pdf_processor.spec +115 -0
- package/ee/documents/processing/pdf_to_markdown.py +420 -0
- package/ee/documents/processing/requirements.txt +4 -0
- package/ee/entitlements.ts +49 -0
- package/ee/markdown.ts +686 -0
- package/ee/queues/decorator.ts +140 -0
- package/ee/queues/queues.ts +156 -0
- package/ee/queues/server.ts +6 -0
- package/ee/rbac-resolver.ts +51 -0
- package/ee/rbac-update.ts +111 -0
- package/ee/schemas.ts +347 -0
- package/ee/tokenizer.ts +80 -0
- package/ee/workers.ts +1423 -0
- package/eslint.config.js +88 -0
- package/jest.config.ts +25 -0
- package/license.md +73 -49
- package/mintlify-docs/.mintignore +7 -0
- package/mintlify-docs/AGENTS.md +33 -0
- package/mintlify-docs/CLAUDE.MD +50 -0
- package/mintlify-docs/CONTRIBUTING.md +32 -0
- package/mintlify-docs/LICENSE +21 -0
- package/mintlify-docs/README.md +55 -0
- package/mintlify-docs/ai-tools/claude-code.mdx +43 -0
- package/mintlify-docs/ai-tools/cursor.mdx +39 -0
- package/mintlify-docs/ai-tools/windsurf.mdx +39 -0
- package/mintlify-docs/api-reference/core-types/agent-types.mdx +110 -0
- package/mintlify-docs/api-reference/core-types/analytics-types.mdx +95 -0
- package/mintlify-docs/api-reference/core-types/configuration-types.mdx +83 -0
- package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +106 -0
- package/mintlify-docs/api-reference/core-types/job-types.mdx +135 -0
- package/mintlify-docs/api-reference/core-types/overview.mdx +73 -0
- package/mintlify-docs/api-reference/core-types/prompt-types.mdx +102 -0
- package/mintlify-docs/api-reference/core-types/rbac-types.mdx +163 -0
- package/mintlify-docs/api-reference/core-types/session-types.mdx +77 -0
- package/mintlify-docs/api-reference/core-types/user-management.mdx +112 -0
- package/mintlify-docs/api-reference/core-types/workflow-types.mdx +88 -0
- package/mintlify-docs/api-reference/core-types.mdx +585 -0
- package/mintlify-docs/api-reference/dynamic-types.mdx +851 -0
- package/mintlify-docs/api-reference/endpoint/create.mdx +4 -0
- package/mintlify-docs/api-reference/endpoint/delete.mdx +4 -0
- package/mintlify-docs/api-reference/endpoint/get.mdx +4 -0
- package/mintlify-docs/api-reference/endpoint/webhook.mdx +4 -0
- package/mintlify-docs/api-reference/introduction.mdx +661 -0
- package/mintlify-docs/api-reference/mutations.mdx +1012 -0
- package/mintlify-docs/api-reference/openapi.json +217 -0
- package/mintlify-docs/api-reference/queries.mdx +1154 -0
- package/mintlify-docs/backend/introduction.mdx +218 -0
- package/mintlify-docs/changelog.mdx +293 -0
- package/mintlify-docs/community-edition.mdx +304 -0
- package/mintlify-docs/core/exulu-agent/api-reference.mdx +894 -0
- package/mintlify-docs/core/exulu-agent/configuration.mdx +690 -0
- package/mintlify-docs/core/exulu-agent/introduction.mdx +552 -0
- package/mintlify-docs/core/exulu-app/api-reference.mdx +481 -0
- package/mintlify-docs/core/exulu-app/configuration.mdx +319 -0
- package/mintlify-docs/core/exulu-app/introduction.mdx +117 -0
- package/mintlify-docs/core/exulu-authentication.mdx +810 -0
- package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +1011 -0
- package/mintlify-docs/core/exulu-chunkers/configuration.mdx +596 -0
- package/mintlify-docs/core/exulu-chunkers/introduction.mdx +403 -0
- package/mintlify-docs/core/exulu-context/api-reference.mdx +911 -0
- package/mintlify-docs/core/exulu-context/configuration.mdx +648 -0
- package/mintlify-docs/core/exulu-context/introduction.mdx +394 -0
- package/mintlify-docs/core/exulu-database.mdx +811 -0
- package/mintlify-docs/core/exulu-default-agents.mdx +545 -0
- package/mintlify-docs/core/exulu-eval/api-reference.mdx +772 -0
- package/mintlify-docs/core/exulu-eval/configuration.mdx +680 -0
- package/mintlify-docs/core/exulu-eval/introduction.mdx +459 -0
- package/mintlify-docs/core/exulu-logging.mdx +464 -0
- package/mintlify-docs/core/exulu-otel.mdx +670 -0
- package/mintlify-docs/core/exulu-queues/api-reference.mdx +648 -0
- package/mintlify-docs/core/exulu-queues/configuration.mdx +650 -0
- package/mintlify-docs/core/exulu-queues/introduction.mdx +474 -0
- package/mintlify-docs/core/exulu-reranker/api-reference.mdx +630 -0
- package/mintlify-docs/core/exulu-reranker/configuration.mdx +663 -0
- package/mintlify-docs/core/exulu-reranker/introduction.mdx +516 -0
- package/mintlify-docs/core/exulu-tool/api-reference.mdx +723 -0
- package/mintlify-docs/core/exulu-tool/configuration.mdx +805 -0
- package/mintlify-docs/core/exulu-tool/introduction.mdx +539 -0
- package/mintlify-docs/core/exulu-variables/api-reference.mdx +699 -0
- package/mintlify-docs/core/exulu-variables/configuration.mdx +736 -0
- package/mintlify-docs/core/exulu-variables/introduction.mdx +511 -0
- package/mintlify-docs/development.mdx +94 -0
- package/mintlify-docs/docs.json +248 -0
- package/mintlify-docs/enterprise-edition.mdx +538 -0
- package/mintlify-docs/essentials/code.mdx +35 -0
- package/mintlify-docs/essentials/images.mdx +59 -0
- package/mintlify-docs/essentials/markdown.mdx +88 -0
- package/mintlify-docs/essentials/navigation.mdx +87 -0
- package/mintlify-docs/essentials/reusable-snippets.mdx +110 -0
- package/mintlify-docs/essentials/settings.mdx +318 -0
- package/mintlify-docs/favicon.svg +3 -0
- package/mintlify-docs/frontend/introduction.mdx +39 -0
- package/mintlify-docs/getting-started.mdx +267 -0
- package/mintlify-docs/guides/custom-agent.mdx +608 -0
- package/mintlify-docs/guides/first-agent.mdx +315 -0
- package/mintlify-docs/images/admin_ui.png +0 -0
- package/mintlify-docs/images/contexts.png +0 -0
- package/mintlify-docs/images/create_agents.png +0 -0
- package/mintlify-docs/images/evals.png +0 -0
- package/mintlify-docs/images/graphql.png +0 -0
- package/mintlify-docs/images/graphql_api.png +0 -0
- package/mintlify-docs/images/hero-dark.png +0 -0
- package/mintlify-docs/images/hero-light.png +0 -0
- package/mintlify-docs/images/hero.png +0 -0
- package/mintlify-docs/images/knowledge_sources.png +0 -0
- package/mintlify-docs/images/mcp.png +0 -0
- package/mintlify-docs/images/scaling.png +0 -0
- package/mintlify-docs/index.mdx +411 -0
- package/mintlify-docs/logo/dark.svg +9 -0
- package/mintlify-docs/logo/light.svg +9 -0
- package/mintlify-docs/partners.mdx +558 -0
- package/mintlify-docs/products.mdx +77 -0
- package/mintlify-docs/snippets/snippet-intro.mdx +4 -0
- package/mintlify-docs/styles.css +207 -0
- package/{documentation → old-documentation}/logging.md +3 -3
- package/package.json +35 -4
- package/skills-lock.json +10 -0
- package/types/context-processor.ts +45 -0
- package/types/exulu-table-definition.ts +79 -0
- package/types/file-types.ts +18 -0
- package/types/models/agent.ts +10 -12
- package/types/models/exulu-agent-tool-config.ts +11 -0
- package/types/models/rate-limiter-rules.ts +7 -0
- package/types/provider-config.ts +21 -0
- package/types/queue-config.ts +16 -0
- package/types/rbac-rights-modes.ts +1 -0
- package/types/statistics.ts +20 -0
- package/types/workflow.ts +31 -0
- package/changelogs/10.11.2025_03.12.2025.md +0 -316
- package/types/models/agent-backend.ts +0 -15
- /package/{documentation → old-documentation}/otel.md +0 -0
- /package/{documentation → old-documentation}/patch-older-releases.md +0 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec for freezing pdf_processor.py into a single console
# executable (presumably invoked by build_pdf_processor.sh in this
# directory — TODO confirm).
from PyInstaller.utils.hooks import copy_metadata, collect_data_files
import os  # NOTE(review): imported but never used in this spec — confirm before removing

block_cipher = None  # No bytecode encryption for the PYZ archive.

# Collect package metadata for packages that need it
# (bundling the dist-info lets importlib.metadata-based version lookups
# succeed inside the frozen app).
datas = []
datas += copy_metadata('docling')
datas += copy_metadata('docling-core')
datas += copy_metadata('docling-parse')
datas += copy_metadata('docling-ibm-models')
datas += copy_metadata('transformers')
datas += copy_metadata('torch')
datas += copy_metadata('tokenizers')
datas += copy_metadata('huggingface-hub')
datas += copy_metadata('pydantic')
datas += copy_metadata('pydantic-core')

# Collect data files from docling packages
datas += collect_data_files('docling_parse')
datas += collect_data_files('docling')
datas += collect_data_files('docling_core')
datas += collect_data_files('docling_ibm_models')
datas += collect_data_files('transformers')

# Collect all data files from docling and transformers packages
a = Analysis(
    ['pdf_processor.py'],
    pathex=[],
    binaries=[],
    datas=datas,
    # Modules that are loaded dynamically (docling backends/plugins, HF model
    # machinery) and therefore invisible to PyInstaller's static analysis.
    hiddenimports=[
        'docling',
        'docling.document_converter',
        'docling.chunking',
        'docling.models',
        'docling.models.plugins',
        'docling.models.plugins.defaults',
        'docling.backend',
        'docling.backend.docling_parse_backend',
        'docling.backend.asciidoc_backend',
        'docling.backend.html_backend',
        'docling.backend.md_backend',
        'docling.backend.msexcel_backend',
        'docling.backend.mspowerpoint_backend',
        'docling.backend.msword_backend',
        'docling.datamodel',
        'docling.datamodel.document',
        'docling_core',
        'docling_core.transforms.chunker',
        'docling_core.transforms.chunker.tokenizer',
        'docling_core.transforms.chunker.tokenizer.huggingface',
        'transformers',
        'transformers.models',
        'transformers.models.auto',
        'torch',
        'numpy',
        'PIL',
        'pdfplumber',
        'pypdf',
        'pikepdf',
        'lxml',
        'bs4',
        'tiktoken',
        'tokenizers',
        'sentencepiece',
        'safetensors',
        'huggingface_hub',
        'tqdm',
        'regex',
        'requests',
        'urllib3',
        'certifi',
        'charset_normalizer',
        'idna',
        'packaging',
        'filelock',
        'pyyaml',
        'jinja2',
        'markupsafe',
    ],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

# One-file console build: archives and binaries are embedded in the EXE and
# unpacked to a temp dir at launch (runtime_tmpdir=None).
exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.zipfiles,
    a.datas,
    [],
    name='pdf_processor',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
PDF to Markdown Converter using Docling
|
|
4
|
+
Converts a PDF to JSON with page-separated markdown and images.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
pdf_to_markdown.py <pdf_file_path> [-o OUTPUT_PATH] [--max-tokens MAX_TOKENS]
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
import os
|
|
12
|
+
import warnings
|
|
13
|
+
import argparse
|
|
14
|
+
import json
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from PIL import Image
|
|
17
|
+
|
|
18
|
+
# Suppress warnings
|
|
19
|
+
warnings.filterwarnings('ignore')
|
|
20
|
+
os.environ['PYTHONWARNINGS'] = 'ignore'
|
|
21
|
+
|
|
22
|
+
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
23
|
+
from docling.datamodel.base_models import InputFormat
|
|
24
|
+
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
25
|
+
from hierarchical.postprocessor import ResultPostprocessor
|
|
26
|
+
|
|
27
|
+
IMAGE_RESOLUTION_SCALE = 2.0
|
|
28
|
+
|
|
29
|
+
def normalize_markdown_content(content: str) -> str:
    """
    Normalize markdown content by removing excessive whitespace,
    especially in table formatting.

    Any line containing '|' is treated as a table row: each cell is
    stripped and the row is rejoined with single-space padding
    (' | '). All other lines only lose trailing whitespace.

    Args:
        content: Raw markdown content

    Returns:
        Normalized markdown content
    """
    # (The original imported `re` here but never used it; removed.)
    normalized_lines = []

    for line in content.split('\n'):
        if '|' in line:
            # Table row: strip every cell, rejoin with uniform padding.
            cells = [cell.strip() for cell in line.split('|')]
            normalized_lines.append(' | '.join(cells))
        else:
            # Non-table line: drop trailing whitespace only.
            normalized_lines.append(line.rstrip())

    return '\n'.join(normalized_lines)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def extract_headings_from_markdown(markdown_content: str) -> list:
    """
    Extract all headings from markdown content as a list of (level, text) tuples.

    Args:
        markdown_content: Markdown text content

    Returns:
        List of (level, text) tuples in order of appearance
    """
    import re

    # ATX-style headings: 1-6 '#' characters followed by whitespace and text.
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    found = []

    for raw_line in markdown_content.split('\n'):
        match = heading_re.match(raw_line.strip())
        if match is None:
            continue
        hashes, title = match.groups()
        # Heading depth is the number of '#' characters.
        found.append((len(hashes), title.strip()))

    return found
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def build_hierarchy_from_stack(heading_stack: list) -> dict:
    """
    Build a nested hierarchy dictionary from a heading stack.

    Every entry but the last becomes a nesting level; the final entry
    maps to None (a leaf).

    Args:
        heading_stack: List of (level, text) tuples representing the current path

    Returns:
        Nested dictionary representing the hierarchy
    """
    root = {}
    node = root
    last_index = len(heading_stack) - 1

    for position, (_level, title) in enumerate(heading_stack):
        if position == last_index:
            # Deepest heading on the path: mark as a leaf.
            node[title] = None
        else:
            # Intermediate heading: descend, creating the level if needed.
            node = node.setdefault(title, {})

    return root
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def merge_hierarchies(h1: dict, h2: dict) -> dict:
    """
    Deep merge two hierarchy dictionaries, combining their structures.

    When a key exists in both: two dicts are merged recursively; otherwise
    the second value wins unless it is None, in which case the first is kept.
    """
    # Trivial cases: one side empty.
    if not h1:
        return dict(h2) if h2 else {}
    if not h2:
        return dict(h1)

    merged = {}
    for key in set(h1) | set(h2):
        in_first = key in h1
        in_second = key in h2
        if in_first and in_second:
            left, right = h1[key], h2[key]
            if isinstance(left, dict) and isinstance(right, dict):
                # Both subtrees: recurse.
                merged[key] = merge_hierarchies(left, right)
            elif right is not None:
                # Prefer the non-None value from the second hierarchy.
                merged[key] = right
            else:
                merged[key] = left
        elif in_first:
            merged[key] = h1[key]
        else:
            merged[key] = h2[key]

    return merged
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def parse_heading_hierarchy(markdown_content: str) -> dict:
    """
    Parse markdown content and build a nested heading hierarchy.
    Headings at the same level are siblings in the hierarchy.

    Args:
        markdown_content: Markdown text content

    Returns:
        Nested dictionary representing heading hierarchy
    """
    import re

    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    stack = []  # Current path as (level, text) tuples.
    tree = {}

    for raw_line in markdown_content.split('\n'):
        match = heading_re.match(raw_line.strip())
        if not match:
            continue

        depth = len(match.group(1))  # Number of '#' symbols.
        title = match.group(2).strip()

        # Unwind the stack past entries at the same or a deeper level
        # (we are moving back up — or sideways — in the outline).
        while stack and stack[-1][0] >= depth:
            stack.pop()
        stack.append((depth, title))

        # Materialize the current path inside the tree.
        node = tree
        last = len(stack) - 1
        for position, (_depth, name) in enumerate(stack):
            if name not in node:
                # Leaves become None, intermediate levels become dicts.
                node[name] = None if position == last else {}
            if position < last:
                # A heading first seen as a leaf gains children later.
                if node[name] is None:
                    node[name] = {}
                node = node[name]

    return tree
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def process_pdf_to_json(pdf_path: str, output_path: str = None, images_dir: str = None) -> list:
    """
    Process a PDF file using Docling and return JSON with page-separated
    markdown and images.

    Args:
        pdf_path: Path to the PDF file
        output_path: Optional output path for the JSON file; when given the
            page objects are also written to disk.
        images_dir: Directory to save page images. NOTE(review): effectively
            required — ``Path(images_dir)`` raises on ``None``; callers
            (main) always supply it — confirm.

    Returns:
        List of page objects, each with "page", "content", "image" and
        "headings" keys.
    """
    # Configure the PDF pipeline so Docling renders a bitmap for every page.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True

    # Convert the PDF document.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    result = converter.convert(source=pdf_path)

    # Apply hierarchical post-processing to fix heading hierarchy
    # (mutates `result` in place).  Plain string: the original used an
    # f-string with no placeholders.
    print("Applying hierarchical post-processing...", file=sys.stderr)
    ResultPostprocessor(result, source=pdf_path).process()

    doc = result.document

    # Export full markdown with explicit page markers, then split on them.
    full_markdown = doc.export_to_markdown(page_break_placeholder="<!-- END_OF_PAGE -->")
    pages = full_markdown.split("<!-- END_OF_PAGE -->")

    # Ensure the image output directory exists.  The original called
    # mkdir(exist_ok=True) twice; once is enough.
    images_dir = Path(images_dir)
    images_dir.mkdir(exist_ok=True)

    # Extract and save page images from the conversion result.
    page_images = {}
    if hasattr(result, 'pages') and result.pages:
        for page_data in result.pages:
            # Docling versions differ on the page-number attribute name.
            page_no = getattr(page_data, 'page_no', None) or getattr(page_data, 'page_number', None)

            if hasattr(page_data, 'image') and page_data.image:
                image_path = images_dir / f"page_{page_no}.png"
                # Only PIL images are persisted; other image types are skipped.
                if isinstance(page_data.image, Image.Image):
                    page_data.image.save(str(image_path), 'PNG')
                    page_images[page_no] = str(image_path)
                    print(f"Saved page {page_no} image to: {image_path}", file=sys.stderr)

    # Build page objects carrying the heading context forward across pages.
    # (A dead `cumulative_markdown` accumulator from the original was removed.)
    page_objects = []
    heading_stack = []  # Current heading context: stack of (level, text) tuples.

    for page_num, page_content in enumerate(pages, start=1):
        # Skip empty pages.
        if not page_content.strip():
            continue

        # Headings that appear on this page only.
        page_headings = extract_headings_from_markdown(page_content)

        page_hierarchy = {}
        if not page_headings:
            # No headings on this page: inherit the current context.
            if heading_stack:
                page_hierarchy = build_hierarchy_from_stack(heading_stack)
        else:
            for level, text in page_headings:
                # Pop entries at the same or a deeper level before pushing.
                while heading_stack and heading_stack[-1][0] >= level:
                    heading_stack.pop()
                heading_stack.append((level, text))

                # Merge this heading path into the page's hierarchy.
                context_hierarchy = build_hierarchy_from_stack(heading_stack)
                page_hierarchy = merge_hierarchies(page_hierarchy, context_hierarchy)

        page_objects.append({
            "page": page_num,
            # Strip excess whitespace (notably in tables) before storing.
            "content": normalize_markdown_content(page_content.strip()),
            # None when no image was rendered for this page.
            "image": page_images.get(page_num),
            "headings": page_hierarchy
        })

    # Persist to disk when an output path was supplied.
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(page_objects, f, indent=2, ensure_ascii=False)
            f.flush()
        print(f"Successfully saved JSON to: {output_path}", file=sys.stderr)
        print(f"Images saved to: {images_dir}", file=sys.stderr)

    return page_objects
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def main():
    """Main entry point for the script.

    Parses CLI arguments, resolves default output locations (a shared
    folder named after the source PDF), runs the conversion, prints
    summary stats to stderr, and exits via os._exit.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(
        description='Convert PDF to Markdown using Docling with hierarchical headings and page markers',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        'pdf_path',
        type=str,
        help='Path to the PDF file to convert'
    )

    parser.add_argument(
        '-o', '--output',
        type=str,
        dest='output_path',
        help='Output path for the JSON file (default: same name as PDF with .json extension)'
    )

    parser.add_argument(
        '--images-dir',
        type=str,
        dest='images_dir',
        help='Directory to save page images (default: <pdf_name>_images/)'
    )

    # Accepted but never read below — reserved for future use per its help text.
    parser.add_argument(
        '--max-tokens',
        type=int,
        dest='max_tokens',
        help='Maximum number of tokens (currently not used, reserved for future use)'
    )

    # Parse arguments
    args = parser.parse_args()

    pdf_path = args.pdf_path
    output_path = args.output_path
    images_dir = args.images_dir

    # Validate the file exists
    if not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    # Create a shared folder named after the source file
    # (all outputs default to living inside it).
    pdf_file = Path(pdf_path)
    shared_folder = pdf_file.parent / pdf_file.stem
    shared_folder.mkdir(exist_ok=True)

    # Default: JSON file inside the shared folder
    if not output_path:
        output_path = str(shared_folder / "processed.json")
    else:
        # If output_path is a directory, append docling.json
        output_path_obj = Path(output_path)
        if output_path_obj.is_dir():
            output_path = str(output_path_obj / "processed.json")
        elif not output_path_obj.suffix:
            # If no extension provided, treat as directory
            output_path_obj.mkdir(exist_ok=True)
            output_path = str(output_path_obj / "processed.json")

    # Default: images directory inside the shared folder
    if not images_dir:
        # If output_path was provided and is in a custom location, use that location's parent
        output_parent = Path(output_path).parent
        images_dir = str(output_parent / "images")

    try:
        # Process the PDF
        print(f"Processing PDF: {pdf_path}", file=sys.stderr)
        page_objects = process_pdf_to_json(pdf_path, output_path, images_dir)

        # Print stats
        total_content_length = sum(len(page['content']) for page in page_objects)
        images_with_content = sum(1 for page in page_objects if page.get('image'))

        print(f"\nJSON output stats:", file=sys.stderr)
        print(f" Total pages: {len(page_objects)}", file=sys.stderr)
        print(f" Pages with images: {images_with_content}", file=sys.stderr)
        print(f" Total content characters: {total_content_length}", file=sys.stderr)

        # Exit cleanly
        # NOTE(review): os._exit bypasses atexit handlers and cleanup —
        # presumably deliberate to avoid hanging on non-daemon threads left
        # behind by torch/docling; confirm before changing to sys.exit.
        sys.stderr.flush()
        sys.stdout.flush()
        os._exit(0)

    except Exception as e:
        print(f"Error processing PDF: {str(e)}", file=sys.stderr)
        # Lazy import: traceback is only needed on the failure path.
        import traceback
        traceback.print_exc(file=sys.stderr)
        sys.stderr.flush()
        os._exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
export const ENTITLEMENTS: {
|
|
2
|
+
"rbac": boolean,
|
|
3
|
+
"advanced-markdown-chunker": boolean,
|
|
4
|
+
"agentic-retrieval": boolean,
|
|
5
|
+
"queues": boolean,
|
|
6
|
+
"custom-branding": boolean,
|
|
7
|
+
"evals": boolean,
|
|
8
|
+
"template-conversations": boolean,
|
|
9
|
+
"agent-feedback": boolean,
|
|
10
|
+
"multi-agent-tooling": boolean,
|
|
11
|
+
"advanced-document-processing": boolean
|
|
12
|
+
} = {
|
|
13
|
+
"rbac": false,
|
|
14
|
+
"advanced-markdown-chunker": false,
|
|
15
|
+
"agentic-retrieval": false,
|
|
16
|
+
"queues": false,
|
|
17
|
+
"custom-branding": false,
|
|
18
|
+
"evals": false,
|
|
19
|
+
"template-conversations": false,
|
|
20
|
+
"agent-feedback": false,
|
|
21
|
+
"multi-agent-tooling": false,
|
|
22
|
+
"advanced-document-processing": false
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
export const checkLicense = () => {
|
|
26
|
+
if (
|
|
27
|
+
!process.env.EXULU_ENTERPRISE_LICENSE ||
|
|
28
|
+
process.env.EXULU_ENTERPRISE_LICENSE === "" ||
|
|
29
|
+
!process.env.EXULU_ENTERPRISE_LICENSE.startsWith("EXULU_EE_")
|
|
30
|
+
) {
|
|
31
|
+
return ENTITLEMENTS
|
|
32
|
+
} else {
|
|
33
|
+
return {
|
|
34
|
+
"rbac": true,
|
|
35
|
+
"advanced-markdown-chunker": true,
|
|
36
|
+
"agentic-retrieval": true,
|
|
37
|
+
"mcp": true,
|
|
38
|
+
"queues": true,
|
|
39
|
+
"prompt-library": true,
|
|
40
|
+
"custom-branding": true,
|
|
41
|
+
"evals": true,
|
|
42
|
+
"template-conversations": true,
|
|
43
|
+
"agent-feedback": true,
|
|
44
|
+
"multi-agent-tooling": true,
|
|
45
|
+
"advanced-document-processing": true
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
}
|