mathpix 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -0
- data/README.md +115 -2
- data/SECURITY.md +1 -1
- data/bin/mathpix-mcp +55 -0
- data/lib/mathpix/batch.rb +7 -8
- data/lib/mathpix/batched_document_conversion.rb +238 -0
- data/lib/mathpix/client.rb +33 -27
- data/lib/mathpix/configuration.rb +5 -9
- data/lib/mathpix/conversion.rb +2 -6
- data/lib/mathpix/document.rb +47 -12
- data/lib/mathpix/document_batcher.rb +191 -0
- data/lib/mathpix/mcp/auth/oauth_provider.rb +8 -9
- data/lib/mathpix/mcp/base_tool.rb +8 -5
- data/lib/mathpix/mcp/elicitations/ambiguity_elicitation.rb +8 -11
- data/lib/mathpix/mcp/elicitations/base_elicitation.rb +2 -0
- data/lib/mathpix/mcp/elicitations/confidence_elicitation.rb +2 -1
- data/lib/mathpix/mcp/elicitations.rb +1 -1
- data/lib/mathpix/mcp/middleware/cors_middleware.rb +2 -6
- data/lib/mathpix/mcp/middleware/oauth_middleware.rb +2 -6
- data/lib/mathpix/mcp/middleware/rate_limiting_middleware.rb +19 -18
- data/lib/mathpix/mcp/resources/formats_list_resource.rb +54 -54
- data/lib/mathpix/mcp/resources/hierarchical_router.rb +9 -18
- data/lib/mathpix/mcp/resources/latest_snip_resource.rb +22 -22
- data/lib/mathpix/mcp/resources/recent_snips_resource.rb +11 -10
- data/lib/mathpix/mcp/resources/snip_stats_resource.rb +14 -12
- data/lib/mathpix/mcp/server.rb +18 -18
- data/lib/mathpix/mcp/tools/batch_convert_tool.rb +31 -37
- data/lib/mathpix/mcp/tools/check_document_status_tool.rb +5 -5
- data/lib/mathpix/mcp/tools/convert_document_tool.rb +15 -14
- data/lib/mathpix/mcp/tools/convert_image_tool.rb +15 -14
- data/lib/mathpix/mcp/tools/convert_strokes_tool.rb +13 -13
- data/lib/mathpix/mcp/tools/get_account_info_tool.rb +1 -1
- data/lib/mathpix/mcp/tools/get_usage_tool.rb +5 -7
- data/lib/mathpix/mcp/tools/list_formats_tool.rb +30 -30
- data/lib/mathpix/mcp/tools/search_results_tool.rb +13 -14
- data/lib/mathpix/mcp/transports/http_streaming_transport.rb +129 -118
- data/lib/mathpix/mcp/transports/sse_stream_handler.rb +37 -35
- data/lib/mathpix/result.rb +3 -2
- data/lib/mathpix/version.rb +1 -1
- data/lib/mathpix.rb +3 -1
- metadata +75 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 10150e3331211cf21bee0d8dfebad1226cc16c8616966d9a5dbb9de06f131b6c
|
4
|
+
data.tar.gz: 8931dcca80cedf7d03d07a8ed1ff92cd3fc65dd6829d5c432a11e2833f99ccf9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 96a10fc2943e50c95e5ec0eec3fab60eef4ba14bff0b2e738a6698264e5cd24e13977171f71f131f9db73b2237a5451f827951728a8e40936a0b7a1c60fb3e6e
|
7
|
+
data.tar.gz: d9af8b573f189f08e4e641b242e42476698421818cc57236b2f27f633eb1e7364ed8fd96b10de95e498dd0d1fab1d69d0cbc07bcc5a285d45ba1cd3667d5b697
|
data/CHANGELOG.md
CHANGED
@@ -5,6 +5,78 @@ All notable changes to this project will be documented in this file.
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
7
|
|
8
|
+
## [Unreleased]
|
9
|
+
|
10
|
+
## [0.1.2] - 2025-10-14
|
11
|
+
|
12
|
+
### Added
|
13
|
+
- **Automatic PDF Batching**: Large PDFs (>1.2MB) are now automatically split into batches for processing
|
14
|
+
- Adaptive batch sizing based on file size and page count
|
15
|
+
- Intelligent checkpoint pattern using seed 1069: [+1, -1, -1, +1, +1, +1, +1]
|
16
|
+
- Automatic result merging across all batches (markdown, LaTeX, HTML, equations, tables, diagrams)
|
17
|
+
- Exponential backoff retry logic for failed batches (3 attempts)
|
18
|
+
- Comprehensive batch metadata tracking
|
19
|
+
- Transparent to existing API - no code changes needed
|
20
|
+
- `DocumentBatcher` class for batch calculation and PDF extraction
|
21
|
+
- `BatchedDocumentConversion` class for managing multi-batch conversions
|
22
|
+
- 38 comprehensive tests for batching functionality (17 DocumentBatcher + 21 BatchedDocumentConversion)
|
23
|
+
- Research-backed documentation (`docs/BATCHING_RESEARCH.md`):
|
24
|
+
- 7 comprehensive web searches on OCR API limits, performance benchmarks, and distributed systems
|
25
|
+
- Industry comparison: AWS Textract, Google Cloud Vision, Azure AI Vision, Adobe Services
|
26
|
+
- Performance analysis: LlamaParse, Docling, Unstructured benchmarks
|
27
|
+
- Rationale for all batching constants (MAX_SINGLE_REQUEST_MB, DEFAULT_PAGES_PER_BATCH, MIN_PAGES_PER_BATCH)
|
28
|
+
- Test infrastructure improvements:
|
29
|
+
- Added `rack-test` dependency for HTTP streaming tests
|
30
|
+
- Fixed RSpec shared examples syntax
|
31
|
+
- Created test summary documentation
|
32
|
+
|
33
|
+
### Changed
|
34
|
+
- `Document` class now automatically uses batching for large PDFs
|
35
|
+
- Batch processing uses seed 1069 for deterministic checkpoint selection
|
36
|
+
- Ruby 3.4.1 compatibility verified and enforced via `.ruby-version`
|
37
|
+
- **BREAKING**: Minimum Ruby version increased from 2.7.0 to 3.2.0 (required by Bundler 2.7.2)
|
38
|
+
|
39
|
+
### Fixed
|
40
|
+
- **Ruby 3.5+ Compatibility**: Added `ostruct ~> 0.6` as explicit runtime dependency
|
41
|
+
- Eliminates deprecation warning: "ostruct will no longer be part of default gems"
|
42
|
+
- Ensures forward compatibility with Ruby 3.5 and later
|
43
|
+
- ostruct used in `HttpStreamingTransport` for error object creation
|
44
|
+
|
45
|
+
### Dependencies
|
46
|
+
- Added `pdf-reader ~> 2.11` for PDF structure parsing
|
47
|
+
- Added `prawn ~> 2.4` for batch PDF creation
|
48
|
+
- Added `ostruct ~> 0.6` for Ruby 3.5+ compatibility
|
49
|
+
- Added `rack-test ~> 2.1` (development) for transport testing
|
50
|
+
|
51
|
+
### Performance
|
52
|
+
- Large PDF conversions now handle files previously rejected by API
|
53
|
+
- Automatic retry reduces failure rates
|
54
|
+
- Parallel batch processing planned for future release
|
55
|
+
|
56
|
+
### Documentation
|
57
|
+
- Added comprehensive batching research document with full citations
|
58
|
+
- Documented adaptive batching algorithm with examples
|
59
|
+
- Added research-backed rationale for all batching constants
|
60
|
+
|
61
|
+
## [0.1.1] - 2025-10-13
|
62
|
+
|
63
|
+
### Added
|
64
|
+
- MCP server executable (`bin/mathpix-mcp`) for Claude Code integration
|
65
|
+
- Comprehensive MCP setup documentation (`MCP_SETUP.md`)
|
66
|
+
- Recovery codes backup documentation with MATHPIX prefix naming convention
|
67
|
+
- GitHub issues created from code TODOs for future enhancements
|
68
|
+
|
69
|
+
### Changed
|
70
|
+
- Recovery code backup files now use MATHPIX prefix for clarity
|
71
|
+
- Updated `.gitignore` to allow MCP_SETUP.md in repository
|
72
|
+
|
73
|
+
### Documentation
|
74
|
+
- Added detailed MCP server installation guide
|
75
|
+
- Documented 9 available MCP tools
|
76
|
+
- Documented 4 available MCP resources
|
77
|
+
- Added troubleshooting section for common MCP issues
|
78
|
+
- Documented secure backup locations for recovery codes
|
79
|
+
|
8
80
|
## [0.1.0] - 2025-10-13
|
9
81
|
|
10
82
|
### Added
|
data/README.md
CHANGED
@@ -10,13 +10,16 @@ Transform mathematical images to LaTeX, chemistry structures to SMILES, and docu
|
|
10
10
|
- 🔒 **Security First**: HTTPS enforcement, path traversal protection, file size limits
|
11
11
|
- 🎯 **Fluent API**: Builder pattern for elegant, chainable operations
|
12
12
|
- ⚡ **Batch Processing**: Parallel execution with callback hooks
|
13
|
+
- 📄 **Smart PDF Batching**: Automatic batching for large PDFs (>1.2MB) with adaptive sizing
|
13
14
|
- 📊 **Multiple Formats**: LaTeX, MathML, AsciiMath, Markdown, SMILES
|
14
15
|
- 🧪 **BDD Tested**: 15+ Cucumber feature files with comprehensive coverage
|
15
|
-
- 🔌 **MCP Integration**: Full Model Context Protocol server
|
16
|
+
- 🔌 **MCP Integration**: Full Model Context Protocol server for any MCP-compatible client
|
16
17
|
- 🎲 **Balanced Ternary**: Seed 1069 encoding utilities
|
17
18
|
|
18
19
|
## Installation
|
19
20
|
|
21
|
+
### As a Ruby Gem
|
22
|
+
|
20
23
|
Add to your Gemfile:
|
21
24
|
|
22
25
|
```ruby
|
@@ -29,6 +32,23 @@ Or install directly:
|
|
29
32
|
gem install mathpix
|
30
33
|
```
|
31
34
|
|
35
|
+
### As an MCP Server
|
36
|
+
|
37
|
+
The gem includes a standalone MCP (Model Context Protocol) server that works with any MCP-compatible client:
|
38
|
+
|
39
|
+
```bash
|
40
|
+
gem install mathpix
|
41
|
+
```
|
42
|
+
|
43
|
+
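The `mathpix-mcp` executable is installed into your RubyGems bin directory — for a user install this is typically `~/.gem/ruby/X.X.X/bin/mathpix-mcp` (run `gem environment` to see the exact executable directory).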
|
44
|
+
|
45
|
+
**Supported MCP Clients:**
|
46
|
+
- Claude Desktop/Code
|
47
|
+
- Any MCP registry-supporting client
|
48
|
+
- Custom MCP implementations
|
49
|
+
|
50
|
+
See [MCP_SETUP.md](MCP_SETUP.md) for complete MCP server setup and configuration.
|
51
|
+
|
32
52
|
## Quick Start
|
33
53
|
|
34
54
|
### Configuration
|
@@ -104,6 +124,47 @@ end
|
|
104
124
|
puts pdf_job.markdown
|
105
125
|
```
|
106
126
|
|
127
|
+
### Large PDF Batching (Automatic)
|
128
|
+
|
129
|
+
For PDFs larger than 1.2MB, the gem automatically uses intelligent batching to prevent "request too large" errors. This happens transparently - no configuration needed.
|
130
|
+
|
131
|
+
```ruby
|
132
|
+
# Large PDF (e.g., 10MB, 200 pages) - automatic batching
|
133
|
+
conversion = Mathpix.document('large_thesis.pdf')
|
134
|
+
.with_formats(:markdown, :latex)
|
135
|
+
.convert
|
136
|
+
|
137
|
+
# Wait for all batches to complete
|
138
|
+
conversion.wait_until_complete
|
139
|
+
|
140
|
+
# Get merged result (all batches combined)
|
141
|
+
result = conversion.result
|
142
|
+
puts "Processed #{result.data['batch_count']} batches"
|
143
|
+
puts "Total pages: #{result.data['total_pages']}"
|
144
|
+
puts result.markdown
|
145
|
+
```
|
146
|
+
|
147
|
+
**How it works:**
|
148
|
+
|
149
|
+
1. **Automatic Detection**: Files > 1.2MB are automatically batched
|
150
|
+
2. **Adaptive Sizing**: Batch size adapts to page density
|
151
|
+
- Dense pages (0.5MB/page) → 2 pages per batch
|
152
|
+
- Normal pages (0.05MB/page) → 10 pages per batch
|
153
|
+
3. **Sequential Processing**: Batches processed in order with exponential backoff retry
|
154
|
+
4. **Result Merging**: Markdown, LaTeX, HTML, and metadata merged automatically
|
155
|
+
5. **Seed 1069 Checkpoints**: Balanced ternary pattern `[+1, -1, -1, +1, +1, +1, +1]` for progress tracking
|
156
|
+
|
157
|
+
**Batch metadata:**
|
158
|
+
|
159
|
+
```ruby
|
160
|
+
result.data['batch_metadata'].each do |batch|
|
161
|
+
puts "Batch #{batch[:batch_num]}: pages #{batch[:page_start]}-#{batch[:page_end]}"
|
162
|
+
puts " Size: #{batch[:size_mb].round(2)} MB"
|
163
|
+
puts " Time: #{batch[:conversion_time_seconds].round(1)}s"
|
164
|
+
puts " Checkpoint: #{batch[:checkpoint] ? '✓' : '✗'}"
|
165
|
+
end
|
166
|
+
```
|
167
|
+
|
107
168
|
### Batch Processing
|
108
169
|
|
109
170
|
```ruby
|
@@ -127,6 +188,58 @@ puts "Success rate: #{results.success_rate}"
|
|
127
188
|
puts "High confidence: #{results.confident(0.9).count}"
|
128
189
|
```
|
129
190
|
|
191
|
+
## MCP Server Usage
|
192
|
+
|
193
|
+
The Mathpix MCP server provides AI assistants with OCR capabilities through the Model Context Protocol.
|
194
|
+
|
195
|
+
### Available Tools (9)
|
196
|
+
|
197
|
+
1. **convert_image** - Convert math/chemistry images to LaTeX/SMILES
|
198
|
+
2. **convert_document** - Convert PDF documents to Markdown (async)
|
199
|
+
3. **check_document_status** - Check status of document conversion
|
200
|
+
4. **batch_convert** - Convert multiple images in parallel
|
201
|
+
5. **get_account_info** - Get account information
|
202
|
+
6. **get_usage** - Get API usage statistics
|
203
|
+
7. **list_formats** - List supported output formats
|
204
|
+
8. **convert_strokes** - Convert handwriting strokes to LaTeX
|
205
|
+
9. **search_results** - Search previous OCR results
|
206
|
+
|
207
|
+
### Available Resources (4)
|
208
|
+
|
209
|
+
1. **formats_list** - List of supported formats
|
210
|
+
2. **latest_snip** - Most recent OCR result
|
211
|
+
3. **recent_snips** - Recent OCR results
|
212
|
+
4. **snip_stats** - Statistics about OCR results
|
213
|
+
|
214
|
+
### Example MCP Configuration
|
215
|
+
|
216
|
+
For any MCP client that supports JSON configuration:
|
217
|
+
|
218
|
+
```json
|
219
|
+
{
|
220
|
+
"mcpServers": {
|
221
|
+
"mathpix": {
|
222
|
+
"command": "/path/to/.gem/ruby/3.3.0/bin/mathpix-mcp",
|
223
|
+
"env": {
|
224
|
+
"MATHPIX_APP_ID": "your_app_id",
|
225
|
+
"MATHPIX_APP_KEY": "your_app_key",
|
226
|
+
"MATHPIX_MAX_FILE_SIZE_MB": "10",
|
227
|
+
"MATHPIX_HTTPS_ONLY": "true"
|
228
|
+
}
|
229
|
+
}
|
230
|
+
}
|
231
|
+
}
|
232
|
+
```
|
233
|
+
|
234
|
+
**Environment Variables:**
|
235
|
+
- `MATHPIX_APP_ID` - Your Mathpix application ID (required)
|
236
|
+
- `MATHPIX_APP_KEY` - Your Mathpix application key (required)
|
237
|
+
- `MATHPIX_MAX_FILE_SIZE_MB` - Maximum file size in MB (default: 10)
|
238
|
+
- `MATHPIX_HTTPS_ONLY` - Force HTTPS (default: true)
|
239
|
+
- `MATHPIX_LOG_LEVEL` - Logging level: DEBUG, INFO, WARN, ERROR
|
240
|
+
|
241
|
+
See [MCP_SETUP.md](MCP_SETUP.md) for detailed setup instructions, troubleshooting, and client-specific configurations.
|
242
|
+
|
130
243
|
## Error Handling
|
131
244
|
|
132
245
|
```ruby
|
@@ -163,7 +276,7 @@ MIT License - see [LICENSE](LICENSE) for details.
|
|
163
276
|
|
164
277
|
## Support
|
165
278
|
|
166
|
-
- GitHub Issues: https://github.com/
|
279
|
+
- GitHub Issues: https://github.com/TeglonLabs/mathpix-gem/issues
|
167
280
|
- Email: ies@prototypesf.org
|
168
281
|
|
169
282
|
---
|
data/SECURITY.md
CHANGED
@@ -130,7 +130,7 @@ spec.metadata['rubygems_mfa_required'] = 'true'
|
|
130
130
|
|
131
131
|
For security-related questions or concerns:
|
132
132
|
- Email: ies@prototypesf.org
|
133
|
-
- GitHub Issues: https://github.com/
|
133
|
+
- GitHub Issues: https://github.com/TeglonLabs/mathpix-gem/issues (for non-sensitive issues)
|
134
134
|
|
135
135
|
## Acknowledgments
|
136
136
|
|
data/bin/mathpix-mcp
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

# Launcher for the Mathpix MCP server.
#
# Reads configuration from environment variables, registers every MCP tool and
# resource, and starts the server. Diagnostics go to stderr.
#
# Environment variables read here:
#   MATHPIX_APP_ID, MATHPIX_APP_KEY  - API credentials
#   MATHPIX_MAX_FILE_SIZE_MB         - positive integer, defaults to 10
#   MATHPIX_HTTPS_ONLY               - anything other than "false" keeps HTTPS-only on
#   MATHPIX_LOG_LEVEL                - DEBUG/INFO/WARN/ERROR/FATAL/UNKNOWN (case-insensitive)

# NOTE(review): 'bundler/setup' expects a Gemfile to be resolvable from the
# working directory; confirm this is intended for a gem-installed executable.
require 'bundler/setup'
require 'mathpix'
require 'mathpix/mcp/server'

# Load environment variables from a .env file when the optional dotenv gem is
# installed. (Guarding on `defined?(Dotenv)` never fires because nothing has
# loaded Dotenv at this point, so attempt the require and tolerate its absence.)
begin
  require 'dotenv/load'
rescue LoadError
  # dotenv is optional; continue with the process environment as-is.
end

DEFAULT_MAX_FILE_SIZE_MB = 10
VALID_LOG_LEVELS = %w[DEBUG INFO WARN ERROR FATAL UNKNOWN].freeze

# Configure Mathpix from environment
Mathpix.configure do |config|
  config.app_id = ENV['MATHPIX_APP_ID']
  config.app_key = ENV['MATHPIX_APP_KEY']

  # Optional configuration.
  # String#to_i would turn "" or "abc" into 0 (which is truthy, so `|| 10`
  # never applied); parse strictly and fall back to the default instead.
  raw_max_mb = ENV['MATHPIX_MAX_FILE_SIZE_MB']
  max_mb = raw_max_mb && Integer(raw_max_mb, 10, exception: false)
  config.max_file_size_mb = max_mb&.positive? ? max_mb : DEFAULT_MAX_FILE_SIZE_MB
  config.https_only = ENV['MATHPIX_HTTPS_ONLY'] != 'false'

  # Logging
  if (level_name = ENV['MATHPIX_LOG_LEVEL'])
    require 'logger'
    config.logger = Logger.new($stderr)

    # Only look up known severity names; an arbitrary string would either raise
    # NameError or resolve to an unrelated Logger constant.
    normalized = level_name.strip.upcase
    if VALID_LOG_LEVELS.include?(normalized)
      config.logger.level = Logger.const_get(normalized)
    else
      warn "Ignoring invalid MATHPIX_LOG_LEVEL=#{level_name.inspect} " \
           "(expected one of: #{VALID_LOG_LEVELS.join(', ')}); using INFO"
      config.logger.level = Logger::INFO
    end
  end
end

# Start MCP server
begin
  server = Mathpix::MCP::Server.new

  # Register all tools
  server.register_tool(Mathpix::MCP::Tools::ConvertImageTool.new)
  server.register_tool(Mathpix::MCP::Tools::ConvertDocumentTool.new)
  server.register_tool(Mathpix::MCP::Tools::CheckDocumentStatusTool.new)
  server.register_tool(Mathpix::MCP::Tools::BatchConvertTool.new)
  server.register_tool(Mathpix::MCP::Tools::GetAccountInfoTool.new)
  server.register_tool(Mathpix::MCP::Tools::GetUsageTool.new)
  server.register_tool(Mathpix::MCP::Tools::ListFormatsTool.new)
  server.register_tool(Mathpix::MCP::Tools::ConvertStrokesTool.new)
  server.register_tool(Mathpix::MCP::Tools::SearchResultsTool.new)

  # Register resources
  server.register_resource(Mathpix::MCP::Resources::FormatsListResource.new)
  server.register_resource(Mathpix::MCP::Resources::LatestSnipResource.new)
  server.register_resource(Mathpix::MCP::Resources::RecentSnipsResource.new)
  server.register_resource(Mathpix::MCP::Resources::SnipStatsResource.new)

  # Start server on stdio
  server.start
rescue StandardError => e
  # Report on stderr and exit non-zero so the launching client sees the failure.
  warn "Error starting Mathpix MCP server: #{e.message}"
  warn e.backtrace.join("\n")
  exit 1
end
|
data/lib/mathpix/batch.rb
CHANGED
@@ -83,14 +83,12 @@ module Mathpix
|
|
83
83
|
errors = []
|
84
84
|
|
85
85
|
image_paths.each do |path|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
callbacks[:error]&.call(e, path)
|
93
|
-
end
|
86
|
+
result = client.snap(path, **options)
|
87
|
+
results << result
|
88
|
+
callbacks[:each]&.call(result)
|
89
|
+
rescue StandardError => e
|
90
|
+
errors << { path: path, error: e }
|
91
|
+
callbacks[:error]&.call(e, path)
|
94
92
|
end
|
95
93
|
|
96
94
|
batch_result = BatchResult.new(results, errors)
|
@@ -126,6 +124,7 @@ module Mathpix
|
|
126
124
|
|
127
125
|
def success_rate
|
128
126
|
return 1.0 if total.zero?
|
127
|
+
|
129
128
|
successful.to_f / total
|
130
129
|
end
|
131
130
|
|
@@ -0,0 +1,238 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mathpix
  # Batched Document Conversion
  #
  # Handles conversion of large PDFs by splitting them into page-range batches,
  # converting each batch separately (sequentially, in order), and merging the
  # per-batch results into a single DocumentResult.
  #
  # The geodesic path: transparent batching with result merging
  #
  # Checkpoint flags:
  # - The balanced-ternary digits of 1069, least significant trit first, are
  #   [+1,-1,-1,+1,+1,+1,+1] (1 - 3 - 9 + 27 + 81 + 243 + 729 = 1069).
  # - The pattern repeats every 7 batches; a batch is flagged when its trit is +1.
  # - Result: flags at batches 1,4,5,6,7,8,11,12,... (5 of every 7 ≈ 71%).
  # - Within this class the flag is only recorded in the batch metadata.
  class BatchedDocumentConversion
    # Seed 1069 in balanced ternary representation (least significant trit
    # first): [+1, -1, -1, +1, +1, +1, +1]. Drives the repeating checkpoint flag.
    SEED_1069 = [1, -1, -1, 1, 1, 1, 1].freeze

    # Total number of convert_document attempts made for each batch.
    BATCH_ATTEMPTS = 3

    attr_reader :client, :document_path, :document_type, :batcher, :options, :batch_metadata, :conversions

    # Initialize batched conversion
    #
    # @param client [Mathpix::Client] API client
    # @param document_path [String] path to PDF
    # @param document_type [Symbol] :pdf, :docx, :pptx
    # @param batcher [DocumentBatcher] batching strategy
    # @param options [Hash] conversion options forwarded to the client as keywords
    def initialize(client, document_path, document_type, batcher, options = {})
      @client = client
      @document_path = document_path
      @document_type = document_type
      @batcher = batcher
      @options = options
      @batch_metadata = []
      @conversions = []
      @completed = false
    end

    # Convert every batch and wait for each one to finish.
    #
    # Idempotent once it has succeeded: further calls return immediately instead
    # of re-submitting every batch and duplicating conversions/metadata. A call
    # made after a failed run starts over with fresh state.
    #
    # @param max_wait [Integer] maximum wait time in seconds PER BATCH
    # @param poll_interval [Float] seconds between polls
    # @return [self]
    # @raise [ConversionError] if any batch fails (remaining batches are skipped)
    def wait_until_complete(max_wait: 600, poll_interval: 3.0)
      return self if @completed

      # Reassign rather than clear so arrays handed out earlier are not mutated.
      @batch_metadata = []
      @conversions = []

      @batcher.calculate_batches.each_with_index do |(start_page, end_page), idx|
        process_batch(idx + 1, start_page, end_page, max_wait: max_wait, poll_interval: poll_interval)
      end

      @completed = true
      self
    end

    # Get merged result from all batches
    #
    # @return [DocumentResult] merged result
    # @raise [ConversionError] if no conversions completed
    def result
      raise ConversionError, 'No batches completed successfully' if @conversions.empty?

      DocumentResult.new(merge_batch_results, @document_path, @document_type)
    end

    # Convenience method: wait and get result
    #
    # @return [DocumentResult]
    def complete!
      wait_until_complete
      result
    end

    private

    # Extract, convert and await a single batch, recording its metadata.
    #
    # @param batch_num [Integer] batch number (1-indexed)
    # @param start_page [Integer] first page of the batch
    # @param end_page [Integer] last page of the batch
    # @param max_wait [Integer] maximum wait for this batch, in seconds
    # @param poll_interval [Float] seconds between polls
    # @return [void]
    # @raise [ConversionError] wrapping any StandardError raised while converting
    def process_batch(batch_num, start_page, end_page, max_wait:, poll_interval:)
      started_at = monotonic_now
      batch_pdf = @batcher.extract_batch(start_page, end_page)
      batch_size = File.size(batch_pdf.path)

      begin
        conversion_id = convert_batch_with_retry(batch_pdf.path, retry_count: BATCH_ATTEMPTS)

        conversion = DocumentConversion.new(@client, conversion_id, batch_pdf.path, @document_type)
        conversion.wait_until_complete(max_wait: max_wait, poll_interval: poll_interval)

        @batch_metadata << batch_record(
          batch_num, start_page, end_page, batch_size,
          status: 'completed',
          elapsed: monotonic_now - started_at,
          checkpoint: should_checkpoint?(batch_num)
        )
        @conversions << conversion
      rescue StandardError => e
        @batch_metadata << batch_record(
          batch_num, start_page, end_page, batch_size,
          status: 'failed',
          error: e.message,
          elapsed: monotonic_now - started_at,
          checkpoint: false
        )

        raise ConversionError.new(
          "Batch #{batch_num} (pages #{start_page}-#{end_page}) failed: #{e.message}",
          conversion_id: nil,
          conversion_status: 'failed'
        )
      ensure
        # Clean up the temp file whether the batch succeeded or failed.
        batch_pdf.close
        batch_pdf.unlink
      end
    end

    # Build one batch metadata entry (same keys and key order for both outcomes;
    # :error is present only for failed batches).
    #
    # @return [Hash] metadata entry
    def batch_record(batch_num, start_page, end_page, size_bytes, status:, elapsed:, checkpoint:, error: nil)
      record = {
        batch_num: batch_num,
        page_start: start_page,
        page_end: end_page,
        size_bytes: size_bytes,
        size_mb: size_bytes / (1024.0 * 1024.0),
        status: status
      }
      record[:error] = error if error
      record[:conversion_time_seconds] = elapsed
      record[:checkpoint] = checkpoint
      record
    end

    # Monotonic timestamp in seconds; unaffected by wall-clock adjustments, so
    # differences are safe to use as durations.
    #
    # @return [Float]
    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end

    # Submit a batch for conversion, retrying on APIError with exponential backoff.
    #
    # @param batch_path [String] path to batch PDF
    # @param retry_count [Integer] total number of attempts (not extra retries)
    # @return [String] conversion ID
    # @raise [APIError] re-raised once all attempts are exhausted
    def convert_batch_with_retry(batch_path, retry_count: 3)
      attempt = 0

      begin
        attempt += 1
        @client.convert_document(
          document_path: batch_path,
          document_type: @document_type,
          **@options
        )
      rescue APIError
        raise if attempt >= retry_count

        # Backoff doubles per failed attempt: 1s, 2s, 4s, ...
        # (with the default of 3 attempts only the 1s and 2s waits occur).
        sleep(2**(attempt - 1))
        retry
      end
    end

    # Merge results from all successfully converted batches.
    #
    # @return [Hash] merged result data
    def merge_batch_results
      batch_results = @conversions.map(&:result)

      {
        # Text formats are concatenated in batch order; nil outputs are dropped.
        'markdown' => join_present(batch_results, :markdown, "\n\n"),
        'latex' => join_present(batch_results, :latex, "\n\n"),
        'html' => join_present(batch_results, :html, "\n"),
        # Collection fields are flattened across batches.
        'pages' => batch_results.flat_map(&:pages),
        'equations' => batch_results.flat_map(&:equations),
        'tables' => batch_results.flat_map(&:tables),
        'diagrams' => batch_results.flat_map(&:diagrams),
        'batched' => true,
        'batch_count' => @conversions.length,
        'total_pages' => @batcher.page_count,
        'total_processing_time' => @batch_metadata.sum { |entry| entry[:conversion_time_seconds] },
        'batch_metadata' => @batch_metadata
      }
    end

    # Join the non-nil values of +field+ across results.
    #
    # @param results [Array] per-batch results
    # @param field [Symbol] reader to call on each result
    # @param separator [String] text placed between batches
    # @return [String]
    def join_present(results, field, separator)
      results.map(&field).compact.join(separator)
    end

    # Check if batch should be checkpointed (Seed 1069 pattern)
    #
    # @param batch_num [Integer] batch number (1-indexed)
    # @return [Boolean] true if the batch's trit is +1
    def should_checkpoint?(batch_num)
      SEED_1069[(batch_num - 1) % SEED_1069.length] == 1
    end
  end
end
|