mathpix 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +53 -0
- data/README.md +114 -1
- data/lib/mathpix/batch.rb +7 -8
- data/lib/mathpix/batched_document_conversion.rb +238 -0
- data/lib/mathpix/client.rb +33 -27
- data/lib/mathpix/configuration.rb +5 -9
- data/lib/mathpix/conversion.rb +2 -6
- data/lib/mathpix/document.rb +47 -12
- data/lib/mathpix/document_batcher.rb +191 -0
- data/lib/mathpix/mcp/auth/oauth_provider.rb +8 -9
- data/lib/mathpix/mcp/base_tool.rb +8 -5
- data/lib/mathpix/mcp/elicitations/ambiguity_elicitation.rb +8 -11
- data/lib/mathpix/mcp/elicitations/base_elicitation.rb +2 -0
- data/lib/mathpix/mcp/elicitations/confidence_elicitation.rb +2 -1
- data/lib/mathpix/mcp/elicitations.rb +1 -1
- data/lib/mathpix/mcp/middleware/cors_middleware.rb +2 -6
- data/lib/mathpix/mcp/middleware/oauth_middleware.rb +2 -6
- data/lib/mathpix/mcp/middleware/rate_limiting_middleware.rb +19 -18
- data/lib/mathpix/mcp/resources/formats_list_resource.rb +54 -54
- data/lib/mathpix/mcp/resources/hierarchical_router.rb +9 -18
- data/lib/mathpix/mcp/resources/latest_snip_resource.rb +22 -22
- data/lib/mathpix/mcp/resources/recent_snips_resource.rb +11 -10
- data/lib/mathpix/mcp/resources/snip_stats_resource.rb +14 -12
- data/lib/mathpix/mcp/server.rb +18 -18
- data/lib/mathpix/mcp/tools/batch_convert_tool.rb +31 -37
- data/lib/mathpix/mcp/tools/check_document_status_tool.rb +5 -5
- data/lib/mathpix/mcp/tools/convert_document_tool.rb +15 -14
- data/lib/mathpix/mcp/tools/convert_image_tool.rb +15 -14
- data/lib/mathpix/mcp/tools/convert_strokes_tool.rb +13 -13
- data/lib/mathpix/mcp/tools/get_account_info_tool.rb +1 -1
- data/lib/mathpix/mcp/tools/get_usage_tool.rb +5 -7
- data/lib/mathpix/mcp/tools/list_formats_tool.rb +30 -30
- data/lib/mathpix/mcp/tools/search_results_tool.rb +13 -14
- data/lib/mathpix/mcp/transports/http_streaming_transport.rb +129 -118
- data/lib/mathpix/mcp/transports/sse_stream_handler.rb +37 -35
- data/lib/mathpix/result.rb +3 -2
- data/lib/mathpix/version.rb +1 -1
- data/lib/mathpix.rb +3 -1
- metadata +60 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 10150e3331211cf21bee0d8dfebad1226cc16c8616966d9a5dbb9de06f131b6c
|
4
|
+
data.tar.gz: 8931dcca80cedf7d03d07a8ed1ff92cd3fc65dd6829d5c432a11e2833f99ccf9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 96a10fc2943e50c95e5ec0eec3fab60eef4ba14bff0b2e738a6698264e5cd24e13977171f71f131f9db73b2237a5451f827951728a8e40936a0b7a1c60fb3e6e
|
7
|
+
data.tar.gz: d9af8b573f189f08e4e641b242e42476698421818cc57236b2f27f633eb1e7364ed8fd96b10de95e498dd0d1fab1d69d0cbc07bcc5a285d45ba1cd3667d5b697
|
data/CHANGELOG.md
CHANGED
@@ -5,6 +5,59 @@ All notable changes to this project will be documented in this file.
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
7
|
|
8
|
+
## [Unreleased]
|
9
|
+
|
10
|
+
## [0.1.2] - 2025-10-14
|
11
|
+
|
12
|
+
### Added
|
13
|
+
- **Automatic PDF Batching**: Large PDFs (>1.2MB) are now automatically split into batches for processing
|
14
|
+
- Adaptive batch sizing based on file size and page count
|
15
|
+
- Intelligent checkpoint pattern using seed 1069: [+1, -1, -1, +1, +1, +1, +1]
|
16
|
+
- Automatic result merging across all batches (markdown, LaTeX, HTML, equations, tables, diagrams)
|
17
|
+
- Exponential backoff retry logic for failed batches (3 attempts)
|
18
|
+
- Comprehensive batch metadata tracking
|
19
|
+
- Transparent to existing API - no code changes needed
|
20
|
+
- `DocumentBatcher` class for batch calculation and PDF extraction
|
21
|
+
- `BatchedDocumentConversion` class for managing multi-batch conversions
|
22
|
+
- 38 comprehensive tests for batching functionality (17 DocumentBatcher + 21 BatchedDocumentConversion)
|
23
|
+
- Research-backed documentation (`docs/BATCHING_RESEARCH.md`):
|
24
|
+
- 7 comprehensive web searches on OCR API limits, performance benchmarks, and distributed systems
|
25
|
+
- Industry comparison: AWS Textract, Google Cloud Vision, Azure AI Vision, Adobe Services
|
26
|
+
- Performance analysis: LlamaParse, Docling, Unstructured benchmarks
|
27
|
+
- Rationale for all batching constants (MAX_SINGLE_REQUEST_MB, DEFAULT_PAGES_PER_BATCH, MIN_PAGES_PER_BATCH)
|
28
|
+
- Test infrastructure improvements:
|
29
|
+
- Added `rack-test` dependency for HTTP streaming tests
|
30
|
+
- Fixed RSpec shared examples syntax
|
31
|
+
- Created test summary documentation
|
32
|
+
|
33
|
+
### Changed
|
34
|
+
- `Document` class now automatically uses batching for large PDFs
|
35
|
+
- Batch processing uses seed 1069 for deterministic checkpoint selection
|
36
|
+
- Ruby 3.4.1 compatibility verified and enforced via `.ruby-version`
|
37
|
+
- **BREAKING**: Minimum Ruby version increased from 2.7.0 to 3.2.0 (required by Bundler 2.7.2)
|
38
|
+
|
39
|
+
### Fixed
|
40
|
+
- **Ruby 3.5+ Compatibility**: Added `ostruct ~> 0.6` as explicit runtime dependency
|
41
|
+
- Eliminates deprecation warning: "ostruct will no longer be part of default gems"
|
42
|
+
- Ensures forward compatibility with Ruby 3.5 and later
|
43
|
+
- ostruct used in `HttpStreamingTransport` for error object creation
|
44
|
+
|
45
|
+
### Dependencies
|
46
|
+
- Added `pdf-reader ~> 2.11` for PDF structure parsing
|
47
|
+
- Added `prawn ~> 2.4` for batch PDF creation
|
48
|
+
- Added `ostruct ~> 0.6` for Ruby 3.5+ compatibility
|
49
|
+
- Added `rack-test ~> 2.1` (development) for transport testing
|
50
|
+
|
51
|
+
### Performance
|
52
|
+
- Large PDF conversions now handle files previously rejected by API
|
53
|
+
- Automatic retry reduces failure rates
|
54
|
+
- Parallel batch processing planned for future release
|
55
|
+
|
56
|
+
### Documentation
|
57
|
+
- Added comprehensive batching research document with full citations
|
58
|
+
- Documented adaptive batching algorithm with examples
|
59
|
+
- Added research-backed rationale for all batching constants
|
60
|
+
|
8
61
|
## [0.1.1] - 2025-10-13
|
9
62
|
|
10
63
|
### Added
|
data/README.md
CHANGED
@@ -10,13 +10,16 @@ Transform mathematical images to LaTeX, chemistry structures to SMILES, and docu
|
|
10
10
|
- 🔒 **Security First**: HTTPS enforcement, path traversal protection, file size limits
|
11
11
|
- 🎯 **Fluent API**: Builder pattern for elegant, chainable operations
|
12
12
|
- ⚡ **Batch Processing**: Parallel execution with callback hooks
|
13
|
+
- 📄 **Smart PDF Batching**: Automatic batching for large PDFs (>1.2MB) with adaptive sizing
|
13
14
|
- 📊 **Multiple Formats**: LaTeX, MathML, AsciiMath, Markdown, SMILES
|
14
15
|
- 🧪 **BDD Tested**: 15+ Cucumber feature files with comprehensive coverage
|
15
|
-
- 🔌 **MCP Integration**: Full Model Context Protocol server
|
16
|
+
- 🔌 **MCP Integration**: Full Model Context Protocol server for any MCP-compatible client
|
16
17
|
- 🎲 **Balanced Ternary**: Seed 1069 encoding utilities
|
17
18
|
|
18
19
|
## Installation
|
19
20
|
|
21
|
+
### As a Ruby Gem
|
22
|
+
|
20
23
|
Add to your Gemfile:
|
21
24
|
|
22
25
|
```ruby
|
@@ -29,6 +32,23 @@ Or install directly:
|
|
29
32
|
gem install mathpix
|
30
33
|
```
|
31
34
|
|
35
|
+
### As an MCP Server
|
36
|
+
|
37
|
+
The gem includes a standalone MCP (Model Context Protocol) server that works with any MCP-compatible client:
|
38
|
+
|
39
|
+
```bash
|
40
|
+
gem install mathpix
|
41
|
+
```
|
42
|
+
|
43
|
+
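The server executable is placed in your RubyGems executable directory (for example `~/.gem/ruby/X.X.X/bin/mathpix-mcp` for a user install); run `gem environment` to see the exact location on your system.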
|
44
|
+
|
45
|
+
**Supported MCP Clients:**
|
46
|
+
- Claude Desktop/Code
|
47
|
+
- Any MCP registry-supporting client
|
48
|
+
- Custom MCP implementations
|
49
|
+
|
50
|
+
See [MCP_SETUP.md](MCP_SETUP.md) for complete MCP server setup and configuration.
|
51
|
+
|
32
52
|
## Quick Start
|
33
53
|
|
34
54
|
### Configuration
|
@@ -104,6 +124,47 @@ end
|
|
104
124
|
puts pdf_job.markdown
|
105
125
|
```
|
106
126
|
|
127
|
+
### Large PDF Batching (Automatic)
|
128
|
+
|
129
|
+
For PDFs larger than 1.2MB, the gem automatically uses intelligent batching to prevent "request too large" errors. This happens transparently - no configuration needed.
|
130
|
+
|
131
|
+
```ruby
|
132
|
+
# Large PDF (e.g., 10MB, 200 pages) - automatic batching
|
133
|
+
conversion = Mathpix.document('large_thesis.pdf')
|
134
|
+
.with_formats(:markdown, :latex)
|
135
|
+
.convert
|
136
|
+
|
137
|
+
# Wait for all batches to complete
|
138
|
+
conversion.wait_until_complete
|
139
|
+
|
140
|
+
# Get merged result (all batches combined)
|
141
|
+
result = conversion.result
|
142
|
+
puts "Processed #{result.data['batch_count']} batches"
|
143
|
+
puts "Total pages: #{result.data['total_pages']}"
|
144
|
+
puts result.markdown
|
145
|
+
```
|
146
|
+
|
147
|
+
**How it works:**
|
148
|
+
|
149
|
+
1. **Automatic Detection**: Files > 1.2MB are automatically batched
|
150
|
+
2. **Adaptive Sizing**: Batch size adapts to page density
|
151
|
+
- Dense pages (0.5MB/page) → 2 pages per batch
|
152
|
+
- Normal pages (0.05MB/page) → 10 pages per batch
|
153
|
+
3. **Sequential Processing**: Batches processed in order with exponential backoff retry
|
154
|
+
4. **Result Merging**: Markdown, LaTeX, HTML, and metadata merged automatically
|
155
|
+
5. **Seed 1069 Checkpoints**: Balanced ternary pattern `[+1, -1, -1, +1, +1, +1, +1]` for progress tracking
|
156
|
+
|
157
|
+
**Batch metadata:**
|
158
|
+
|
159
|
+
```ruby
|
160
|
+
result.data['batch_metadata'].each do |batch|
|
161
|
+
puts "Batch #{batch[:batch_num]}: pages #{batch[:page_start]}-#{batch[:page_end]}"
|
162
|
+
puts " Size: #{batch[:size_mb].round(2)} MB"
|
163
|
+
puts " Time: #{batch[:conversion_time_seconds].round(1)}s"
|
164
|
+
puts " Checkpoint: #{batch[:checkpoint] ? '✓' : '✗'}"
|
165
|
+
end
|
166
|
+
```
|
167
|
+
|
107
168
|
### Batch Processing
|
108
169
|
|
109
170
|
```ruby
|
@@ -127,6 +188,58 @@ puts "Success rate: #{results.success_rate}"
|
|
127
188
|
puts "High confidence: #{results.confident(0.9).count}"
|
128
189
|
```
|
129
190
|
|
191
|
+
## MCP Server Usage
|
192
|
+
|
193
|
+
The Mathpix MCP server provides AI assistants with OCR capabilities through the Model Context Protocol.
|
194
|
+
|
195
|
+
### Available Tools (9)
|
196
|
+
|
197
|
+
1. **convert_image** - Convert math/chemistry images to LaTeX/SMILES
|
198
|
+
2. **convert_document** - Convert PDF documents to Markdown (async)
|
199
|
+
3. **check_document_status** - Check status of document conversion
|
200
|
+
4. **batch_convert** - Convert multiple images in parallel
|
201
|
+
5. **get_account_info** - Get account information
|
202
|
+
6. **get_usage** - Get API usage statistics
|
203
|
+
7. **list_formats** - List supported output formats
|
204
|
+
8. **convert_strokes** - Convert handwriting strokes to LaTeX
|
205
|
+
9. **search_results** - Search previous OCR results
|
206
|
+
|
207
|
+
### Available Resources (4)
|
208
|
+
|
209
|
+
1. **formats_list** - List of supported formats
|
210
|
+
2. **latest_snip** - Most recent OCR result
|
211
|
+
3. **recent_snips** - Recent OCR results
|
212
|
+
4. **snip_stats** - Statistics about OCR results
|
213
|
+
|
214
|
+
### Example MCP Configuration
|
215
|
+
|
216
|
+
For any MCP client that supports JSON configuration:
|
217
|
+
|
218
|
+
```json
|
219
|
+
{
|
220
|
+
"mcpServers": {
|
221
|
+
"mathpix": {
|
222
|
+
"command": "/path/to/.gem/ruby/3.3.0/bin/mathpix-mcp",
|
223
|
+
"env": {
|
224
|
+
"MATHPIX_APP_ID": "your_app_id",
|
225
|
+
"MATHPIX_APP_KEY": "your_app_key",
|
226
|
+
"MATHPIX_MAX_FILE_SIZE_MB": "10",
|
227
|
+
"MATHPIX_HTTPS_ONLY": "true"
|
228
|
+
}
|
229
|
+
}
|
230
|
+
}
|
231
|
+
}
|
232
|
+
```
|
233
|
+
|
234
|
+
**Environment Variables:**
|
235
|
+
- `MATHPIX_APP_ID` - Your Mathpix application ID (required)
|
236
|
+
- `MATHPIX_APP_KEY` - Your Mathpix application key (required)
|
237
|
+
- `MATHPIX_MAX_FILE_SIZE_MB` - Maximum file size (default: 10)
|
238
|
+
- `MATHPIX_HTTPS_ONLY` - Force HTTPS (default: true)
|
239
|
+
- `MATHPIX_LOG_LEVEL` - Logging level: DEBUG, INFO, WARN, ERROR
|
240
|
+
|
241
|
+
See [MCP_SETUP.md](MCP_SETUP.md) for detailed setup instructions, troubleshooting, and client-specific configurations.
|
242
|
+
|
130
243
|
## Error Handling
|
131
244
|
|
132
245
|
```ruby
|
data/lib/mathpix/batch.rb
CHANGED
@@ -83,14 +83,12 @@ module Mathpix
|
|
83
83
|
errors = []
|
84
84
|
|
85
85
|
image_paths.each do |path|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
callbacks[:error]&.call(e, path)
|
93
|
-
end
|
86
|
+
result = client.snap(path, **options)
|
87
|
+
results << result
|
88
|
+
callbacks[:each]&.call(result)
|
89
|
+
rescue StandardError => e
|
90
|
+
errors << { path: path, error: e }
|
91
|
+
callbacks[:error]&.call(e, path)
|
94
92
|
end
|
95
93
|
|
96
94
|
batch_result = BatchResult.new(results, errors)
|
@@ -126,6 +124,7 @@ module Mathpix
|
|
126
124
|
|
127
125
|
def success_rate
|
128
126
|
return 1.0 if total.zero?
|
127
|
+
|
129
128
|
successful.to_f / total
|
130
129
|
end
|
131
130
|
|
@@ -0,0 +1,238 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Mathpix
|
4
|
+
# Batched Document Conversion
|
5
|
+
#
|
6
|
+
# Handles conversion of large PDFs by splitting into batches,
|
7
|
+
# converting each batch separately, and merging results.
|
8
|
+
#
|
9
|
+
# The geodesic path: transparent batching with result merging
|
10
|
+
#
|
11
|
+
# Checkpointing strategy informed by distributed systems research (2025-10-14):
|
12
|
+
# - 7 comprehensive searches on chunking strategies for RAG and distributed processing
|
13
|
+
# - Finding: Optimal chunk overlap is 10-20% (50-100 tokens for 512-token chunks)
|
14
|
+
# - Finding: Memory optimization requires periodic state persistence (every 1000 pages)
|
15
|
+
# - Our approach: Balanced ternary seed 1069 creates checkpoint pattern [+1,-1,-1,+1,+1,+1,+1]
|
16
|
+
# - Result: Checkpoints at batches 1,4,5,6,7,8,11,12,... (5 of every 7 batches, ≈71% checkpoint rate)
|
17
|
+
# - Balances fault tolerance with processing overhead
|
18
|
+
class BatchedDocumentConversion
|
19
|
+
# Seed 1069 in balanced ternary representation, least-significant trit first: [+1, -1, -1, +1, +1, +1, +1]
|
20
|
+
# Creates deterministic checkpoint pattern repeating every 7 batches
|
21
|
+
# Checkpoints enable partial recovery if processing fails mid-document
|
22
|
+
# Pattern chosen for mathematical elegance and practical fault tolerance
|
23
|
+
SEED_1069 = [1, -1, -1, 1, 1, 1, 1].freeze
|
24
|
+
|
25
|
+
attr_reader :client, :document_path, :document_type, :batcher, :options, :batch_metadata, :conversions
|
26
|
+
|
27
|
+
# Initialize batched conversion
|
28
|
+
#
|
29
|
+
# @param client [Mathpix::Client] API client
|
30
|
+
# @param document_path [String] path to PDF
|
31
|
+
# @param document_type [Symbol] :pdf, :docx, :pptx
|
32
|
+
# @param batcher [DocumentBatcher] batching strategy
|
33
|
+
# @param options [Hash] conversion options
|
34
|
+
def initialize(client, document_path, document_type, batcher, options = {})
|
35
|
+
@client = client
|
36
|
+
@document_path = document_path
|
37
|
+
@document_type = document_type
|
38
|
+
@batcher = batcher
|
39
|
+
@options = options
|
40
|
+
@batch_metadata = []
|
41
|
+
@conversions = []
|
42
|
+
end
|
43
|
+
|
44
|
+
# Wait for all batches to complete
|
45
|
+
#
|
46
|
+
# @param max_wait [Integer] maximum wait time in seconds PER BATCH
|
47
|
+
# @param poll_interval [Float] seconds between polls
|
48
|
+
# @return [self]
|
49
|
+
def wait_until_complete(max_wait: 600, poll_interval: 3.0)
|
50
|
+
batch_ranges = @batcher.calculate_batches
|
51
|
+
|
52
|
+
batch_ranges.each_with_index do |(start_page, end_page), idx|
|
53
|
+
batch_num = idx + 1
|
54
|
+
batch_start_time = Time.now
|
55
|
+
|
56
|
+
# Extract batch PDF
|
57
|
+
batch_pdf = @batcher.extract_batch(start_page, end_page)
|
58
|
+
batch_size = File.size(batch_pdf.path)
|
59
|
+
|
60
|
+
begin
|
61
|
+
# Convert batch with retry logic
|
62
|
+
conversion_id = convert_batch_with_retry(batch_pdf.path, retry_count: 3)
|
63
|
+
|
64
|
+
# Wait for completion
|
65
|
+
conversion = DocumentConversion.new(
|
66
|
+
@client,
|
67
|
+
conversion_id,
|
68
|
+
batch_pdf.path,
|
69
|
+
@document_type
|
70
|
+
)
|
71
|
+
conversion.wait_until_complete(max_wait: max_wait, poll_interval: poll_interval)
|
72
|
+
|
73
|
+
# Record metadata
|
74
|
+
batch_time = Time.now - batch_start_time
|
75
|
+
@batch_metadata << {
|
76
|
+
batch_num: batch_num,
|
77
|
+
page_start: start_page,
|
78
|
+
page_end: end_page,
|
79
|
+
size_bytes: batch_size,
|
80
|
+
size_mb: batch_size / (1024.0 * 1024.0),
|
81
|
+
status: 'completed',
|
82
|
+
conversion_time_seconds: batch_time,
|
83
|
+
checkpoint: should_checkpoint?(batch_num)
|
84
|
+
}
|
85
|
+
|
86
|
+
@conversions << conversion
|
87
|
+
rescue StandardError => e
|
88
|
+
# Record failure
|
89
|
+
batch_time = Time.now - batch_start_time
|
90
|
+
@batch_metadata << {
|
91
|
+
batch_num: batch_num,
|
92
|
+
page_start: start_page,
|
93
|
+
page_end: end_page,
|
94
|
+
size_bytes: batch_size,
|
95
|
+
size_mb: batch_size / (1024.0 * 1024.0),
|
96
|
+
status: 'failed',
|
97
|
+
error: e.message,
|
98
|
+
conversion_time_seconds: batch_time,
|
99
|
+
checkpoint: false
|
100
|
+
}
|
101
|
+
|
102
|
+
raise ConversionError.new(
|
103
|
+
"Batch #{batch_num} (pages #{start_page}-#{end_page}) failed: #{e.message}",
|
104
|
+
conversion_id: nil,
|
105
|
+
conversion_status: 'failed'
|
106
|
+
)
|
107
|
+
ensure
|
108
|
+
# Clean up temp file
|
109
|
+
batch_pdf.close
|
110
|
+
batch_pdf.unlink
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
self
|
115
|
+
end
|
116
|
+
|
117
|
+
# Get merged result from all batches
|
118
|
+
#
|
119
|
+
# @return [DocumentResult] merged result
|
120
|
+
# @raise [ConversionError] if no conversions completed
|
121
|
+
def result
|
122
|
+
raise ConversionError, 'No batches completed successfully' if @conversions.empty?
|
123
|
+
|
124
|
+
# Merge results from all successful batches
|
125
|
+
merged_data = merge_batch_results
|
126
|
+
|
127
|
+
DocumentResult.new(merged_data, @document_path, @document_type)
|
128
|
+
end
|
129
|
+
|
130
|
+
# Convenience method: wait and get result
|
131
|
+
#
|
132
|
+
# @return [DocumentResult]
|
133
|
+
def complete!
|
134
|
+
wait_until_complete
|
135
|
+
result
|
136
|
+
end
|
137
|
+
|
138
|
+
private
|
139
|
+
|
140
|
+
# Convert batch with exponential backoff retry
|
141
|
+
#
|
142
|
+
# @param batch_path [String] path to batch PDF
|
143
|
+
# @param retry_count [Integer] maximum number of attempts, including the first one
|
144
|
+
# @return [String] conversion ID
|
145
|
+
def convert_batch_with_retry(batch_path, retry_count: 3)
|
146
|
+
attempt = 0
|
147
|
+
|
148
|
+
begin
|
149
|
+
attempt += 1
|
150
|
+
@client.convert_document(
|
151
|
+
document_path: batch_path,
|
152
|
+
document_type: @document_type,
|
153
|
+
**@options
|
154
|
+
)
|
155
|
+
rescue APIError
|
156
|
+
if attempt < retry_count
|
157
|
+
# Exponential backoff: 1s, 2s, 4s, ... (with the default 3 attempts only the 1s and 2s delays occur)
|
158
|
+
sleep_time = 2**(attempt - 1)
|
159
|
+
sleep sleep_time
|
160
|
+
retry
|
161
|
+
end
|
162
|
+
|
163
|
+
raise
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
# Merge results from all batches
|
168
|
+
#
|
169
|
+
# @return [Hash] merged result data
|
170
|
+
def merge_batch_results
|
171
|
+
# Extract results from each batch
|
172
|
+
batch_results = @conversions.map(&:result)
|
173
|
+
|
174
|
+
# Merge markdown (concatenate with blank line separator)
|
175
|
+
merged_markdown = batch_results
|
176
|
+
.map(&:markdown)
|
177
|
+
.compact
|
178
|
+
.join("\n\n")
|
179
|
+
|
180
|
+
# Merge LaTeX
|
181
|
+
merged_latex = batch_results
|
182
|
+
.map(&:latex)
|
183
|
+
.compact
|
184
|
+
.join("\n\n")
|
185
|
+
|
186
|
+
# Merge HTML
|
187
|
+
merged_html = batch_results
|
188
|
+
.map(&:html)
|
189
|
+
.compact
|
190
|
+
.join("\n")
|
191
|
+
|
192
|
+
# Merge pages (flatten arrays)
|
193
|
+
all_pages = batch_results
|
194
|
+
.flat_map(&:pages)
|
195
|
+
|
196
|
+
# Merge equations
|
197
|
+
all_equations = batch_results
|
198
|
+
.flat_map(&:equations)
|
199
|
+
|
200
|
+
# Merge tables
|
201
|
+
all_tables = batch_results
|
202
|
+
.flat_map(&:tables)
|
203
|
+
|
204
|
+
# Merge diagrams
|
205
|
+
all_diagrams = batch_results
|
206
|
+
.flat_map(&:diagrams)
|
207
|
+
|
208
|
+
# Calculate total processing time
|
209
|
+
total_time = @batch_metadata
|
210
|
+
.sum { |m| m[:conversion_time_seconds] }
|
211
|
+
|
212
|
+
# Build merged data
|
213
|
+
{
|
214
|
+
'markdown' => merged_markdown,
|
215
|
+
'latex' => merged_latex,
|
216
|
+
'html' => merged_html,
|
217
|
+
'pages' => all_pages,
|
218
|
+
'equations' => all_equations,
|
219
|
+
'tables' => all_tables,
|
220
|
+
'diagrams' => all_diagrams,
|
221
|
+
'batched' => true,
|
222
|
+
'batch_count' => @conversions.length,
|
223
|
+
'total_pages' => @batcher.page_count,
|
224
|
+
'total_processing_time' => total_time,
|
225
|
+
'batch_metadata' => @batch_metadata
|
226
|
+
}
|
227
|
+
end
|
228
|
+
|
229
|
+
# Check if batch should be checkpointed (Seed 1069 pattern)
|
230
|
+
#
|
231
|
+
# @param batch_num [Integer] batch number (1-indexed)
|
232
|
+
# @return [Boolean] true if trit is +1
|
233
|
+
def should_checkpoint?(batch_num)
|
234
|
+
trit_index = (batch_num - 1) % 7
|
235
|
+
SEED_1069[trit_index] == 1
|
236
|
+
end
|
237
|
+
end
|
238
|
+
end
|
data/lib/mathpix/client.rb
CHANGED
@@ -29,11 +29,11 @@ module Mathpix
|
|
29
29
|
src, source_ref = prepare_image_source(image_path_or_url, options)
|
30
30
|
|
31
31
|
response = post('/text', {
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
src: src,
|
33
|
+
formats: (options[:formats] || config.default_formats).map(&:to_s),
|
34
|
+
include_line_data: options[:include_line_data] || false,
|
35
|
+
**build_request_options(options)
|
36
|
+
})
|
37
37
|
|
38
38
|
Result.new(response, source_ref)
|
39
39
|
end
|
@@ -79,10 +79,10 @@ module Mathpix
|
|
79
79
|
end
|
80
80
|
|
81
81
|
response = post('/converter', {
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
82
|
+
mmd: mmd,
|
83
|
+
formats: formats_hash,
|
84
|
+
conversion_options: options[:conversion_options] || {}
|
85
|
+
})
|
86
86
|
|
87
87
|
conversion_id = response['conversion_id']
|
88
88
|
Conversion.new(self, conversion_id: conversion_id, mmd: mmd, formats: formats)
|
@@ -137,10 +137,10 @@ module Mathpix
|
|
137
137
|
def convert_document(document_path:, document_type:, **options)
|
138
138
|
# Encode document as base64 data URI or use URL
|
139
139
|
src = if url?(document_path)
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
140
|
+
document_path
|
141
|
+
else
|
142
|
+
encode_image(document_path) # Reuse existing encoding
|
143
|
+
end
|
144
144
|
|
145
145
|
# Build conversion request
|
146
146
|
request_body = {
|
@@ -151,7 +151,7 @@ module Mathpix
|
|
151
151
|
}
|
152
152
|
|
153
153
|
response = post('/pdf', request_body)
|
154
|
-
response['pdf_id']
|
154
|
+
response['pdf_id'] # Returns conversion ID for polling
|
155
155
|
end
|
156
156
|
|
157
157
|
# Get document conversion status
|
@@ -202,13 +202,13 @@ module Mathpix
|
|
202
202
|
# @param options [Hash] additional options
|
203
203
|
# @return [Array<String, String>] src value and source reference
|
204
204
|
# @raise [InvalidRequestError] if input looks like malformed URL
|
205
|
-
def prepare_image_source(input,
|
205
|
+
def prepare_image_source(input, _options = {})
|
206
206
|
# Handle hash input: { url: '...' } or { path: '...' }
|
207
207
|
if input.is_a?(Hash)
|
208
208
|
if input[:url] || input['url']
|
209
209
|
url = input[:url] || input['url']
|
210
|
-
url = config.upgrade_to_https(url)
|
211
|
-
validate_url!(url)
|
210
|
+
url = config.upgrade_to_https(url) # Auto-upgrade HTTP→HTTPS
|
211
|
+
validate_url!(url) # Raise InvalidRequestError if malformed
|
212
212
|
return [url, url]
|
213
213
|
elsif input[:path] || input['path']
|
214
214
|
path = input[:path] || input['path']
|
@@ -222,22 +222,20 @@ module Mathpix
|
|
222
222
|
|
223
223
|
# Detect if input is URL or local path
|
224
224
|
if url?(upgraded_input)
|
225
|
-
[upgraded_input, upgraded_input]
|
225
|
+
[upgraded_input, upgraded_input] # Use URL directly as src
|
226
226
|
elsif looks_like_url?(input)
|
227
227
|
# String contains URL-like patterns but isn't valid
|
228
228
|
raise InvalidRequestError, "Invalid URL format: #{input}"
|
229
229
|
else
|
230
230
|
# Try to encode as local file
|
231
231
|
begin
|
232
|
-
[encode_image(input), input]
|
233
|
-
rescue SecurityError, Errno::ENOENT
|
232
|
+
[encode_image(input), input] # Encode local file (use original path)
|
233
|
+
rescue SecurityError, Errno::ENOENT
|
234
234
|
# If file encoding fails and input doesn't look like a file path,
|
235
235
|
# it's likely a malformed URL
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
raise # Re-raise original error for actual file path issues
|
240
|
-
end
|
236
|
+
raise InvalidRequestError, "Invalid URL format: #{input}" unless looks_like_file_path?(input)
|
237
|
+
|
238
|
+
raise # Re-raise original error for actual file path issues
|
241
239
|
end
|
242
240
|
end
|
243
241
|
end
|
@@ -248,6 +246,7 @@ module Mathpix
|
|
248
246
|
# @return [Boolean]
|
249
247
|
def url?(str)
|
250
248
|
return false unless str.is_a?(String)
|
249
|
+
|
251
250
|
config.valid_url?(str)
|
252
251
|
end
|
253
252
|
|
@@ -260,6 +259,7 @@ module Mathpix
|
|
260
259
|
# @return [Boolean]
|
261
260
|
def looks_like_url?(str)
|
262
261
|
return false unless str.is_a?(String)
|
262
|
+
|
263
263
|
# URL-like patterns: contains protocol or www prefix
|
264
264
|
str.match?(%r{^(https?://|www\.)|://})
|
265
265
|
end
|
@@ -270,6 +270,7 @@ module Mathpix
|
|
270
270
|
# @raise [InvalidRequestError] if URL is not valid
|
271
271
|
def validate_url!(url)
|
272
272
|
return if config.valid_url?(url)
|
273
|
+
|
273
274
|
raise InvalidRequestError, "Invalid URL format: #{url}"
|
274
275
|
end
|
275
276
|
|
@@ -282,8 +283,9 @@ module Mathpix
|
|
282
283
|
# @return [Boolean]
|
283
284
|
def looks_like_file_path?(str)
|
284
285
|
return false unless str.is_a?(String)
|
286
|
+
|
285
287
|
# File path patterns: starts with ~, / or ., contains a slash or backslash, or ends with a known image/document extension
|
286
|
-
str.match?(%r{^[
|
288
|
+
str.match?(%r{^[~/.]|/|\\|\.(?:png|jpe?g|gif|webp|pdf|docx|pptx)$}i)
|
287
289
|
end
|
288
290
|
|
289
291
|
# Encode image to base64 data URI (with path sanitization)
|
@@ -417,7 +419,11 @@ module Mathpix
|
|
417
419
|
retry_after: response['Retry-After']&.to_i
|
418
420
|
)
|
419
421
|
when Net::HTTPClientError
|
420
|
-
error_data =
|
422
|
+
error_data = begin
|
423
|
+
JSON.parse(response.body)
|
424
|
+
rescue StandardError
|
425
|
+
{}
|
426
|
+
end
|
421
427
|
raise APIError.new(
|
422
428
|
error_data['error'] || 'Client error',
|
423
429
|
status: response.code.to_i,
|
@@ -62,7 +62,7 @@ module Mathpix
|
|
62
62
|
@rate_limit = RATE_LIMIT_DEFAULT
|
63
63
|
|
64
64
|
# Structured logging
|
65
|
-
@logger = nil
|
65
|
+
@logger = nil # Can be set to Logger instance
|
66
66
|
end
|
67
67
|
|
68
68
|
def validate!
|
@@ -70,14 +70,10 @@ module Mathpix
|
|
70
70
|
raise ConfigurationError, 'app_key is required' if app_key.nil? || app_key.empty?
|
71
71
|
|
72
72
|
# Validate API URL uses HTTPS
|
73
|
-
if enforce_https && !api_url.start_with?('https://')
|
74
|
-
raise ConfigurationError, 'API URL must use HTTPS'
|
75
|
-
end
|
73
|
+
raise ConfigurationError, 'API URL must use HTTPS' if enforce_https && !api_url.start_with?('https://')
|
76
74
|
|
77
75
|
# Validate timeout
|
78
|
-
if timeout <= 0 || timeout > 300
|
79
|
-
raise ConfigurationError, 'Timeout must be between 1 and 300 seconds'
|
80
|
-
end
|
76
|
+
raise ConfigurationError, 'Timeout must be between 1 and 300 seconds' if timeout <= 0 || timeout > 300
|
81
77
|
|
82
78
|
true
|
83
79
|
end
|
@@ -132,7 +128,7 @@ module Mathpix
|
|
132
128
|
return url unless url.is_a?(String)
|
133
129
|
return url unless url.start_with?('http://')
|
134
130
|
|
135
|
-
url.sub(
|
131
|
+
url.sub(%r{^http://}, 'https://')
|
136
132
|
end
|
137
133
|
|
138
134
|
# Sanitize file path to prevent directory traversal
|
@@ -151,7 +147,7 @@ module Mathpix
|
|
151
147
|
|
152
148
|
# Check for directory traversal attempts
|
153
149
|
return nil if normalized.include?('../')
|
154
|
-
return nil if normalized.match?(
|
150
|
+
return nil if normalized.match?(%r{\.\.[/\\]})
|
155
151
|
|
156
152
|
# Check file exists (for local paths)
|
157
153
|
return nil unless File.exist?(normalized)
|