mathpix-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,182 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mathpix
4
# Settings used by the Mathpix client: API credentials, endpoint and timeout,
# plus the security and resource limits applied to caller-supplied input.
#
# Credentials, API URL and timeout are read from the MATHPIX_APP_ID,
# MATHPIX_APP_KEY, MATHPIX_API_URL and MATHPIX_TIMEOUT environment variables
# when the object is created; every other setting starts from the constants
# defined below.
# Seed: 1069 - Deterministic configuration values
class Configuration
  # Security constants
  HTTPS_ONLY = true
  MAX_FILE_SIZE_MB = 10
  MAX_PATH_LENGTH = 1024
  ALLOWED_SCHEMES = %w[https].freeze

  # Resource limits
  MIN_LIMIT = 1
  MAX_LIMIT = 100
  DEFAULT_LIMIT = 10

  # Confidence thresholds (balanced ternary seed 1069)
  CONFIDENCE_HIGH = 0.9
  CONFIDENCE_MEDIUM = 0.7
  CONFIDENCE_LOW = 0.5

  # Rate limiting (requests per minute)
  RATE_LIMIT_DEFAULT = 60
  RATE_LIMIT_BURST = 10

  # Hosts rejected by #valid_url?. Matched against the lower-cased host with
  # IPv6 brackets removed. This is a textual check only: it does not resolve
  # DNS names and does not canonicalise alternative IP spellings.
  BLOCKED_HOST_PATTERN = %r{
    \A(?:
      localhost                      | # loopback name (prefix match)
      127\.                          | # IPv4 loopback
      0\.0\.0\.0                     | # IPv4 unspecified address
      10\.                           | # RFC 1918 private range
      172\.(?:1[6-9]|2[0-9]|3[01])\. | # RFC 1918 private range
      192\.168\.                     | # RFC 1918 private range
      169\.254\.                     | # IPv4 link-local
      ::1?\z                         | # IPv6 loopback and unspecified
      ::ffff:                        | # IPv4-mapped IPv6
      fe[89ab][0-9a-f]:              | # IPv6 link-local
      f[cd][0-9a-f][0-9a-f]:           # IPv6 unique-local
    )
  }x

  # A ".." path segment delimited by either slash style or the string ends.
  PARENT_SEGMENT = %r{(?:\A|[/\\])\.\.(?:[/\\]|\z)}

  # Logger methods #log is allowed to dispatch to.
  LOG_LEVELS = %i[debug info warn error fatal unknown].freeze

  private_constant :BLOCKED_HOST_PATTERN, :PARENT_SEGMENT, :LOG_LEVELS

  attr_accessor :app_id, :app_key, :api_url, :timeout, :default_formats,
                :user_agent, :enforce_https, :max_file_size_mb, :logger, :seed

  attr_reader :rate_limit, :confidence_thresholds

  # Alias endpoint for api_url
  alias endpoint api_url
  alias endpoint= api_url=

  # Build a configuration from the environment and the class-level defaults.
  def initialize
    @app_id = ENV.fetch('MATHPIX_APP_ID', nil)
    @app_key = ENV.fetch('MATHPIX_APP_KEY', nil)
    @api_url = ENV.fetch('MATHPIX_API_URL', 'https://api.mathpix.com/v3')
    @timeout = ENV.fetch('MATHPIX_TIMEOUT', '30').to_i
    @default_formats = [:latex_styled]
    @user_agent = "mathpix-ruby/#{Mathpix::VERSION}"

    # Security settings
    @enforce_https = HTTPS_ONLY
    @max_file_size_mb = MAX_FILE_SIZE_MB
    @max_path_length = MAX_PATH_LENGTH

    # Resource limits
    @min_limit = MIN_LIMIT
    @max_limit = MAX_LIMIT
    @default_limit = DEFAULT_LIMIT

    # Confidence thresholds
    @confidence_thresholds = {
      high: CONFIDENCE_HIGH,
      medium: CONFIDENCE_MEDIUM,
      low: CONFIDENCE_LOW
    }

    # Rate limiting
    @rate_limit = RATE_LIMIT_DEFAULT

    # Structured logging
    @logger = nil # Can be set to Logger instance
  end

  # Check that the configuration is usable.
  #
  # Missing or non-numeric values are reported as ConfigurationError rather
  # than surfacing as NoMethodError on nil.
  #
  # @return [true] when every check passes
  # @raise [ConfigurationError] if a credential is blank, the API URL is not
  #   HTTPS while enforce_https is set, or the timeout is outside (0, 300]
  def validate!
    raise ConfigurationError, 'app_id is required' if app_id.to_s.strip.empty?
    raise ConfigurationError, 'app_key is required' if app_key.to_s.strip.empty?

    # Validate API URL uses HTTPS (URL schemes are case-insensitive)
    https_url = api_url.to_s.match?(%r{\Ahttps://}i)
    raise ConfigurationError, 'API URL must use HTTPS' if enforce_https && !https_url

    # Validate timeout
    timeout_ok = timeout.is_a?(Numeric) && timeout.real? && timeout.positive? && timeout <= 300
    raise ConfigurationError, 'Timeout must be between 1 and 300 seconds' unless timeout_ok

    true
  end

  # Sanitize limit to be within bounds
  #
  # @param limit [#to_i] requested limit
  # @return [Integer] limit clamped to the configured minimum and maximum
  def sanitize_limit(limit)
    limit.to_i.clamp(@min_limit, @max_limit)
  end

  # Check whether a remote URL is acceptable: HTTP(S) scheme (HTTPS only when
  # enforce_https is set), a non-empty host, and a host that is not a
  # loopback, private, or link-local address literal.
  #
  # The host check is textual; names that resolve to internal addresses are
  # not detected here.
  #
  # @param url [String] URL to validate
  # @return [Boolean]
  def valid_url?(url)
    return false unless url.is_a?(String)
    return false if url.length > @max_path_length

    uri = URI.parse(url)

    # Must be HTTP(S) scheme
    return false unless %w[http https].include?(uri.scheme)

    # Enforce HTTPS if enabled
    return false if enforce_https && uri.scheme != 'https'

    # URI#hostname strips the brackets from IPv6 literals ("[::1]" -> "::1");
    # downcasing stops mixed-case spellings from slipping past the pattern.
    host = uri.hostname.to_s.downcase
    return false if host.empty?

    !host.match?(BLOCKED_HOST_PATTERN)
  rescue URI::InvalidURIError
    false
  end

  # Auto-upgrade HTTP to HTTPS for remote URLs
  #
  # Anything that is not a String starting with "http://" (in any letter
  # case) is returned unchanged.
  #
  # @param url [String] URL that may be HTTP or HTTPS
  # @return [String] URL with https:// scheme
  # @example
  #   upgrade_to_https('http://example.com/img.png')
  #   # => 'https://example.com/img.png'
  def upgrade_to_https(url)
    return url unless url.is_a?(String) && url.match?(%r{\Ahttp://}i)

    url.sub(%r{\Ahttp://}i, 'https://')
  end

  # Validate a local file path and return its absolute form.
  #
  # Returns nil when the input is not a String, is longer than the configured
  # maximum, contains a ".." segment, does not name an existing regular file,
  # or names a file larger than max_file_size_mb.
  #
  # @param path [String] file path
  # @return [String, nil] absolute path or nil if invalid
  def sanitize_path(path)
    return nil unless path.is_a?(String)
    return nil if path.length > @max_path_length

    # Remove null bytes
    cleaned = path.delete("\0")

    # Reject traversal on the raw input: File.expand_path collapses ".."
    # segments, so checking the expanded path could never detect them.
    return nil if cleaned.match?(PARENT_SEGMENT)

    normalized = File.expand_path(cleaned)

    # Must be an existing regular file (directories are not uploadable)
    return nil unless File.file?(normalized)

    # Check file size
    size_mb = File.size(normalized).to_f / (1024 * 1024)
    return nil if size_mb > @max_file_size_mb

    normalized
  rescue StandardError
    # Best effort by design: any filesystem or argument error means "invalid".
    nil
  end

  # Emit a JSON-encoded log line through the configured logger; does nothing
  # when no logger is set.
  #
  # Only the standard logger severities are dispatched. Any other level is
  # written at :info so that a caller-supplied level can never invoke an
  # arbitrary method on the logger.
  #
  # @param level [Symbol] log level (:debug, :info, :warn, :error)
  # @param message [String] log message
  # @param data [Hash] structured data merged into the payload
  def log(level, message, data = {})
    return unless @logger

    severity = LOG_LEVELS.find { |name| name.to_s == level.to_s } || :info

    structured_message = {
      timestamp: Time.now.utc.iso8601,
      level: level,
      message: message,
      seed: 1069,
      **data
    }.to_json

    @logger.public_send(severity, structured_message)
  end
end
182
+ end
@@ -0,0 +1,345 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mathpix
4
# Fluent builder for a document conversion request (PDF, DOCX, PPTX).
#
# Option methods record a setting in #options and return the builder so calls
# can be chained; #convert passes the collected options to the client.
class Document
  attr_reader :client, :document_path, :options

  # @param client [Object] object responding to #convert_document
  # @param document_path [String] path of the document to convert
  def initialize(client, document_path)
    @options = {}
    @document_path = document_path
    @client = client
  end

  # Choose the output formats; nested arrays are flattened.
  # @param formats [Array<Symbol>] format names
  # @return [self]
  # @example
  #   doc.with_formats(:markdown, :latex, :docx)
  def with_formats(*formats)
    put(:formats, formats.flatten)
  end

  # Turn on table extraction and merge any extra options into #options.
  # Extra options are applied last, so they win over the flag set here.
  # @param options [Hash] table options
  # @return [self]
  def with_tables(**table_options)
    put(:include_table_html, true)
    @options.update(table_options)
    self
  end

  # Turn on diagram extraction.
  # @return [self]
  def with_diagrams
    put(:include_diagram_svg, true)
  end

  # Record the requested quality level.
  # @param level [Symbol] :low, :medium, :high
  # @return [self]
  def quality(level)
    put(:quality, level)
  end

  # Request line-level data (bounding boxes).
  # @return [self]
  def with_line_data
    put(:include_line_data, true)
  end

  # Request word-level data (bounding boxes).
  # @return [self]
  def with_word_data
    put(:include_word_data, true)
  end

  # Restrict processing to a page range.
  # @param start_page [Integer] first page (1-indexed)
  # @param end_page [Integer, nil] last page (nil = all)
  # @return [self]
  def pages(start_page, end_page = nil)
    put(:page_ranges, { start: start_page, end: end_page })
  end

  # Start the conversion (async operation).
  #
  # The whole document is uploaded in a single request — the Mathpix /v3/pdf
  # endpoint paginates large PDFs server-side.
  #
  # @return [DocumentConversion] handle for the running conversion
  # @raise [InvalidImageError] if the file extension is not supported
  # @example
  #   conversion = Mathpix::Document.new(client, 'paper.pdf')
  #     .with_formats(:markdown, :latex)
  #     .convert
  #   conversion.wait_until_complete
  #   conversion.save_markdown('output.md')
  def convert
    type = detect_document_type
    id = client.convert_document(document_path: document_path, document_type: type, **options)
    DocumentConversion.new(client, id, document_path, type)
  end

  alias_method :call, :convert
  alias_method :run, :convert

  private

  # Store one option and hand back the builder for chaining.
  def put(key, value)
    @options[key] = value
    self
  end

  # Map the (case-insensitive) file extension to a document type.
  # @return [Symbol] :pdf, :docx, :pptx
  def detect_document_type
    extension = File.extname(document_path).downcase
    type = { '.pdf' => :pdf, '.docx' => :docx, '.pptx' => :pptx }[extension]
    return type if type

    raise InvalidImageError.new(
      "Unsupported document format: #{extension}",
      recommended_format: 'pdf, docx, pptx'
    )
  end
end
114
+
115
# Handle for an asynchronous document conversion.
#
# Polls the client for the conversion status until it completes or fails, then
# exposes the outcome as a DocumentResult.
class DocumentConversion
  attr_reader :client, :conversion_id, :document_path, :document_type

  # @param client [Object] object responding to #get_document_status and
  #   #get_document_output
  # @param conversion_id [String] identifier returned when the conversion started
  # @param document_path [String] path of the source document
  # @param document_type [Symbol] document type passed on to the result
  def initialize(client, conversion_id, document_path, document_type)
    @client = client
    @conversion_id = conversion_id
    @document_path = document_path
    @document_type = document_type
  end

  # Wait for conversion to complete.
  #
  # Elapsed time is measured with the monotonic clock, so adjustments to the
  # system clock while polling can neither trigger nor suppress the timeout.
  #
  # @param max_wait [Numeric] maximum wait time in seconds
  # @param poll_interval [Numeric] seconds between polls
  # @return [self]
  # @raise [ConversionError] if the status is 'error' or 'failed'
  # @raise [TimeoutError] if a non-terminal status is seen after max_wait
  def wait_until_complete(max_wait: 600, poll_interval: 3.0)
    started_at = monotonic_now

    loop do
      status_data = client.get_document_status(conversion_id)
      status = status_data['status']

      case status
      when 'completed'
        @result = DocumentResult.new(build_result_data(status_data), document_path, document_type)
        return self
      when 'error', 'failed'
        raise ConversionError.new(
          "Document conversion failed: #{extract_status_error(status_data)}",
          conversion_id: conversion_id,
          conversion_status: status
        )
      end

      # Any non-terminal status keeps polling. Mathpix reports several
      # intermediate states (received, loaded, split, processing,
      # pending, ...) — we only stop on 'completed' or an error.
      if monotonic_now - started_at > max_wait
        raise TimeoutError, "Document conversion timed out after #{max_wait}s (last status: #{status})"
      end

      sleep poll_interval
    end
  end

  # Get result (must wait_until_complete first)
  # @return [DocumentResult]
  # @raise [ConversionError] if the conversion has not completed yet
  def result
    @result || raise(ConversionError, 'Conversion not yet complete. Call wait_until_complete first.')
  end

  # Convenience method: wait (with default limits) and get result
  # @return [DocumentResult]
  def complete!
    wait_until_complete
    result
  end

  # Save markdown output, waiting for completion first if necessary
  # @param path [String] output file path
  def save_markdown(path)
    completed_result.save_markdown(path)
  end

  # Save LaTeX output, waiting for completion first if necessary
  # @param path [String] output file path
  def save_latex(path)
    completed_result.save_latex(path)
  end

  # Save HTML output, waiting for completion first if necessary
  # @param path [String] output file path
  def save_html(path)
    completed_result.save_html(path)
  end

  # Save DOCX output, waiting for completion first if necessary
  # @param path [String] output file path
  def save_docx(path)
    completed_result.save_docx(path)
  end

  private

  # Current reading of the monotonic clock, in seconds.
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # The stored result, blocking on #complete! when none is available yet.
  def completed_result
    @result || complete!
  end

  # Merge fetched output content into the status payload so DocumentResult
  # can expose markdown/html. The /v3/pdf/{id} status JSON never contains the
  # converted text — it must be fetched from the .{ext} endpoints.
  # The status payload itself is left untouched; a shallow copy is returned.
  def build_result_data(status_data)
    data = status_data.dup
    data['markdown'] ||= fetch_output('mmd')
    data['html'] ||= fetch_output('html')
    data
  end

  # Fetch one output format; a library error is treated as "not available".
  def fetch_output(format)
    client.get_document_output(conversion_id, format)
  rescue Mathpix::Error
    nil
  end

  # Pull a descriptive failure reason from a Mathpix status payload.
  # 'error_info' may be a hash (message/id) or a plain string; falls back to
  # the 'error' field and finally to 'unknown error'.
  def extract_status_error(status_data)
    info = status_data['error_info']
    reason =
      case info
      when Hash then info['message'] || info['id']
      when String then info unless info.empty?
      end

    reason || status_data['error'] || 'unknown error'
  end
end
231
+
232
# Result of a finished document conversion.
#
# Wraps the result payload (via Result) together with the source document's
# path and type, and offers readers and save helpers for the converted output.
class DocumentResult < Result
  attr_reader :document_path, :document_type

  # @param data [Hash] result payload handed to Result
  # @param document_path [String, nil] path of the source document
  # @param document_type [Symbol, nil] :pdf, :docx or :pptx
  def initialize(data, document_path = nil, document_type = nil)
    super(data)
    @document_type = document_type
    @document_path = document_path
  end

  # Per-page entries of the payload.
  # @return [Array<Hash>] page data (empty when the payload has none)
  def pages
    data['pages'] || []
  end

  # Number of page entries.
  # @return [Integer]
  def page_count
    pages.length
  end

  # Processing time: the payload's 'total_processing_time', else its
  # 'processing_time', else whatever #processing_time_ms returns.
  # NOTE(review): the last fallback looks like milliseconds while the other
  # two are presumably seconds — confirm the units against Result.
  # @return [Numeric, nil]
  def processing_time
    reported = data['total_processing_time'] || data['processing_time']
    reported || processing_time_ms
  end

  # Equations gathered from every page.
  # @return [Array<String>]
  def equations
    collect_from_pages('equations')
  end

  # Tables gathered from every page.
  # @return [Array<Hash>]
  def tables
    collect_from_pages('tables')
  end

  # Diagrams gathered from every page.
  # @return [Array<Hash>]
  def diagrams
    collect_from_pages('diagrams')
  end

  # Markdown output ('markdown' key, falling back to 'mmd').
  # @return [String, nil]
  def markdown
    data['markdown'] || data['mmd']
  end

  # LaTeX output.
  # @return [String, nil]
  def latex
    data['latex']
  end

  # HTML output.
  # @return [String, nil]
  def html
    data['html']
  end

  # Write the markdown output to a file; does nothing when there is none.
  # @param path [String] output file path
  def save_markdown(path)
    write_text(path, markdown)
  end

  # Write the LaTeX output to a file; does nothing when there is none.
  # @param path [String] output file path
  def save_latex(path)
    write_text(path, latex)
  end

  # Write the HTML output to a file; does nothing when there is none.
  # @param path [String] output file path
  def save_html(path)
    write_text(path, html)
  end

  # Write the DOCX output to a file, downloading it from 'docx_url' when that
  # key is present and otherwise using inline 'docx_data'. Does nothing when
  # neither key is set.
  # NOTE(review): this class never sets or defines `client`; the download
  # branch relies on Result (or an ancestor) providing it — confirm.
  # @param path [String] output file path
  def save_docx(path)
    url = data['docx_url']
    return File.binwrite(path, client.download(url)) if url

    inline = data['docx_data']
    File.binwrite(path, inline) if inline
  end

  # Whether the source document was a PDF.
  # @return [Boolean]
  def pdf?
    document_type == :pdf
  end

  # Whether the source document was a DOCX file.
  # @return [Boolean]
  def docx?
    document_type == :docx
  end

  # Whether the source document was a PPTX file.
  # @return [Boolean]
  def pptx?
    document_type == :pptx
  end

  private

  # Concatenate the arrays stored under +key+ on each page, skipping pages
  # that lack the key.
  def collect_from_pages(key)
    pages.flat_map { |page| page[key] || [] }
  end

  # Write +content+ to +path+ unless it is nil or false.
  def write_text(path, content)
    return unless content

    File.write(path, content)
  end
end
341
+
342
+ # Backward-compatibility aliases: PDF and PDFResult are additional names for Document and DocumentResult
343
+ PDF = Document
344
+ PDFResult = DocumentResult
345
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
module Mathpix
  # Base class for every error raised by this library.
  #
  # The message is optional so that a bare `raise Mathpix::Error` (or any
  # subclass) works; without a message Ruby reports the class name.
  class Error < StandardError
    attr_reader :details

    # @param message [String, nil] human-readable description
    # @param details [Hash] extra context attached to the error
    def initialize(message = nil, details: {})
      super(message)
      @details = details
    end
  end

  # Configuration error
  class ConfigurationError < Error; end

  # API error, optionally carrying a status value
  class APIError < Error
    attr_reader :status

    # @param message [String, nil] human-readable description
    # @param status [Object, nil] status reported with the failure
    # @param details [Hash] extra context attached to the error
    def initialize(message = nil, status: nil, details: {})
      super(message, details: details)
      @status = status
    end
  end

  # Rate limit error
  class RateLimitError < APIError
    attr_reader :retry_after

    # @param message [String, nil] human-readable description
    # @param retry_after [Object, nil] retry hint supplied by the caller
    # @param api_options [Hash] remaining keywords, forwarded to APIError
    #   (status:, details:)
    def initialize(message = nil, retry_after: nil, **api_options)
      super(message, **api_options)
      @retry_after = retry_after
    end
  end

  # Server error (5xx)
  class ServerError < APIError; end

  # Network/timeout error
  class NetworkError < Error; end
  class TimeoutError < NetworkError; end

  # Low confidence error
  class LowConfidenceError < Error
    attr_reader :confidence, :suggestions

    # @param message [String, nil] human-readable description
    # @param confidence [Numeric, nil] confidence value that triggered the error
    # @param suggestions [Array] suggestions supplied by the caller
    # @param details [Hash] extra context attached to the error
    def initialize(message = nil, confidence: nil, suggestions: [], details: {})
      super(message, details: details)
      @confidence = confidence
      @suggestions = suggestions
    end
  end

  # Invalid request error (malformed input)
  class InvalidRequestError < Error; end

  # Invalid image error
  class InvalidImageError < Error
    attr_reader :recommended_format

    # @param message [String, nil] human-readable description
    # @param recommended_format [String, nil] format(s) the caller should use
    # @param details [Hash] extra context attached to the error
    def initialize(message = nil, recommended_format: nil, details: {})
      super(message, details: details)
      @recommended_format = recommended_format
    end
  end

  # Conversion error
  class ConversionError < Error
    attr_reader :conversion_id, :conversion_status

    # @param message [String, nil] human-readable description
    # @param conversion_id [String, nil] identifier of the failed conversion
    # @param conversion_status [String, nil] status reported for the conversion
    # @param details [Hash] extra context attached to the error
    def initialize(message = nil, conversion_id: nil, conversion_status: nil, details: {})
      super(message, details: details)
      @conversion_id = conversion_id
      @conversion_status = conversion_status
    end
  end
end