mindee 4.7.0.pre.rc1 → 4.7.0.pre.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c88f4c2b47bf3a97cd519d1c658f12a874ed930c8e86d5ff208c0c44d4174cca
4
- data.tar.gz: 191850ef611da7ca60fac4827f6c530348adf2d07d4a7790dd46a6abb335e906
3
+ metadata.gz: 95c963d2c0530278f42fa3543609db9f34156d74e031c08c929be8043794425e
4
+ data.tar.gz: 92b4cfd60be3f710de9bdab714d4291b152d6d253373dcfae20428a940ad5ec9
5
5
  SHA512:
6
- metadata.gz: cbf412b01eb4c6a324f6548232ce24e7ed0bd6f20b12d3803a47218e971805b77c32989e644077809aaf3ec404946461deaf4e440385812228ece7d8d839d6b6
7
- data.tar.gz: 381aebed32ebee1cbd763f361c46bf76eff8fd6592f1d9652ccb79ca5515a804e20c8a36459c56af189e5af883ec9b02c2f69a921dde46d15ad6bb1b7224267d
6
+ metadata.gz: 64be40a13e4662730c5749d5348860abf30e0c44002e4501d19a99a8c51c4afb2a83f9d901b1b3b4bbc43d255d8db3a27410b4ff5d274028092b7ec1368d830d
7
+ data.tar.gz: 5e9201f8c7d9beee93f601e0fe12758c20f55175c7c0476544bada2d1f8a7828962c848c75bb882169dea9c677649f53b3e0eba824cbdc559810f196fb3c0a75
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # Mindee Ruby API Library Changelog
2
2
 
3
+ ## v4.7.0-rc2 - 2025-08-20
4
+ ### Changes
5
+ * :sparkles: add missing accessors for PDF fixing options in `LocalInputSource`
6
+ ### Fixes
7
+ * :recycle: update existing PDF fixing syntax
8
+ * :memo: fix typos & documentation
9
+
10
+
3
11
  ## v4.7.0-rc1 - 2025-08-13
4
12
  ### Changes
5
13
  * :sparkles: add support for client V2 & associated features
@@ -8,10 +8,10 @@ model_id = 'MY_MODEL_ID'
8
8
  mindee_client = Mindee::ClientV2.new(api_key: api_key)
9
9
 
10
10
  # Set inference parameters
11
- params = Mindee::Input::InferenceParameters.new(
11
+ inference_params = Mindee::Input::InferenceParameters.new(
12
12
  # ID of the model, required.
13
13
  model_id,
14
- # If set to `True`, will enable Retrieval-Augmented Generation.
14
+ # If set to `true`, will enable Retrieval-Augmented Generation.
15
15
  rag: false,
16
16
  )
17
17
 
@@ -21,7 +21,7 @@ input_source = Mindee::Input::Source::PathInputSource.new(input_path)
21
21
  # Send for processing
22
22
  response = mindee_client.enqueue_and_get_inference(
23
23
  input_source,
24
- params # Note: this parameter can also be provided as a Hash.
24
+ inference_params # Note: this parameter can also be provided as a Hash.
25
25
  )
26
26
 
27
27
  # Print a brief summary of the parsed data
@@ -16,7 +16,7 @@ module Mindee
16
16
  # @param mime_type [String]
17
17
  def initialize(mime_type)
18
18
  @invalid_mimetype = mime_type
19
- super("'#{@invalid_mimetype}' mime type not allowed, must be one of" \
19
+ super("'#{@invalid_mimetype}' mime type not allowed, must be one of " \
20
20
  "#{Mindee::Input::Source::ALLOWED_MIME_TYPES.join(', ')}")
21
21
  end
22
22
  end
@@ -18,12 +18,10 @@ module Mindee
18
18
  # @return [Origami::PDF] A PdfDocument handle.
19
19
  def self.attach_image_as_new_file(input_buffer, format: 'jpg')
20
20
  magick_image = MiniMagick::Image.read(input_buffer)
21
- # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
22
- # converted.
21
+ # NOTE: We force format consolidation to a single format to avoid frames being interpreted as the final output.
23
22
  magick_image.format(format)
24
23
  original_density = magick_image.resolution
25
- scale_factor = original_density[0].to_f / 4.166666 # No clue why the resolution needs to be reduced for
26
- # the pdf otherwise the resulting image shrinks.
24
+ scale_factor = original_density[0].to_f / 4.166666 # Convert from default 300 DPI to 72.
27
25
  magick_image.format('pdf', 0, { density: scale_factor.to_s })
28
26
  Origami::PDF.read(StringIO.new(magick_image.to_blob))
29
27
  end
@@ -32,8 +30,7 @@ module Mindee
32
30
  #
33
31
  # @param [Input::Source::LocalInputSource] input_source
34
32
  # @param [Integer] page_id ID of the Page to extract from.
35
- # @param [Array<Array<Geometry::Point>>, Array<Geometry::Quadrilateral>] polygons List of coordinates.
36
- # to extract.
33
+ # @param [Array<Array<Geometry::Point>>, Array<Geometry::Quadrilateral>] polygons List of coordinates to extract.
37
34
  # @return [Array<Image::ExtractedImage>] Extracted Images.
38
35
  def self.extract_multiple_images_from_source(input_source, page_id, polygons)
39
36
  new_stream = load_input_source_pdf_page_as_stringio(input_source, page_id)
@@ -69,10 +69,10 @@ module Mindee
69
69
  end
70
70
 
71
71
  # Deserializes a loaded response
72
- # @param response_class [Parsing::V2::JobResponse, Parsing::V2::InferenceResponse] class to return.
72
+ # @param response_class [Parsing::V2::CommonResponse] class to return.
73
73
  # @return [Parsing::V2::JobResponse, Parsing::V2::InferenceResponse]
74
74
  def deserialize_response(response_class)
75
- response_class.new(as_hash)
75
+ response_class.new(as_hash) # : Parsing::V2::JobResponse | Parsing::V2::InferenceResponse
76
76
  rescue StandardError
77
77
  raise Errors::MindeeInputError, 'Invalid response provided.'
78
78
  end
@@ -47,8 +47,7 @@ module Mindee
47
47
  end
48
48
 
49
49
  if filename.end_with?('.pdf') && repair_pdf
50
- rescue_broken_pdf(@io_stream)
51
- @file_mimetype = Marcel::MimeType.for @io_stream
50
+ fix_pdf!
52
51
 
53
52
  logger.debug("Loaded new input #{@filename} from #{self.class}")
54
53
  return if ALLOWED_MIME_TYPES.include? @file_mimetype
@@ -57,27 +56,40 @@ module Mindee
57
56
  raise Errors::MindeeMimeTypeError, @file_mimetype.to_s
58
57
  end
59
58
 
60
- # Attempts to fix pdf files if mimetype is rejected.
61
- # "Broken PDFs" are often a result of third-party injecting invalid headers.
62
- # This attempts to remove them and send the file
63
- # @param stream [StringIO, File]
64
- def rescue_broken_pdf(stream)
65
- stream.gets('%PDF-')
66
- raise Errors::MindeePDFError if stream.eof? || stream.pos > 500
67
-
68
- stream.pos = stream.pos - 5
69
- data = stream.read
70
- @io_stream.close
71
-
72
- @io_stream = StringIO.new
73
- @io_stream << data
59
+ # @deprecated See {#fix_pdf!} or {LocalInputSource.fix_pdf} instead.
60
+ def rescue_broken_pdf(_)
61
+ fix_pdf!
74
62
  end
75
63
 
76
- # Shorthand for pdf mimetype validation.
64
+ # Shorthand for PDF mimetype validation.
77
65
  def pdf?
78
66
  @file_mimetype.to_s == 'application/pdf'
79
67
  end
80
68
 
69
+ # Attempts to fix the PDF data in the file.
70
+ # @param maximum_offset [Integer] Maximum offset to look for the PDF header.
71
+ # @return [void]
72
+ # @raise [Mindee::Errors::MindeePDFError]
73
+ def fix_pdf!(maximum_offset: 500)
74
+ @io_stream = LocalInputSource.fix_pdf(@io_stream, maximum_offset: maximum_offset)
75
+ @io_stream.rewind
76
+ @file_mimetype = Marcel::MimeType.for @io_stream
77
+ end
78
+
79
+ # Attempts to fix the PDF data in the given stream.
80
+ # @param stream [StringIO, File] The stream to fix.
81
+ # @param maximum_offset [Integer] Maximum offset to look for the PDF header.
82
+ # @return [StringIO] The fixed stream.
83
+ # @raise [Mindee::Errors::MindeePDFError]
84
+ def self.fix_pdf(stream, maximum_offset: 500)
85
+ out_stream = StringIO.new
86
+ stream.gets('%PDF-')
87
+ raise Errors::MindeePDFError if stream.eof? || stream.pos > maximum_offset
88
+
89
+ stream.pos = stream.pos - 5
90
+ out_stream << stream.read
91
+ end
92
+
81
93
  # Cuts a PDF file according to provided options.
82
94
  # @param options [PageOptions, nil] Page cutting/merge options:
83
95
  #
@@ -166,8 +178,8 @@ module Mindee
166
178
  @io_stream.rewind
167
179
  end
168
180
 
169
- # Checks whether the file has source text if it is a pdf. False otherwise
170
- # @return [bool] True if the file is a PDF and has source text.
181
+ # Checks whether the file has source text if it is a PDF; returns `false` otherwise.
182
+ # @return [bool] `true` if the file is a PDF and has source text.
171
183
  def source_text?
172
184
  Mindee::PDF::PDFTools.source_text?(@io_stream)
173
185
  end
@@ -78,7 +78,7 @@ module Mindee
78
78
  #
79
79
  # @param method_name [Symbol] The name of the method being checked.
80
80
  # @param include_private [bool] Whether to include private methods in the check.
81
- # @return [bool] True if the method can be responded to, false otherwise.
81
+ # @return [bool] `true` if the method can be responded to, `false` otherwise.
82
82
  def respond_to_missing?(method_name, include_private = false)
83
83
  @all_values.key?(method_name.to_s) || super
84
84
  end
@@ -40,25 +40,25 @@ module Mindee
40
40
  end
41
41
 
42
42
  # Check if this is a certain confidence level.
43
- # @return [Boolean] True if confidence is certain.
43
+ # @return [Boolean] `true` if confidence is certain.
44
44
  def certain?
45
45
  @value == CERTAIN
46
46
  end
47
47
 
48
48
  # Check if this is a high confidence level.
49
- # @return [Boolean] True if confidence is high.
49
+ # @return [Boolean] `true` if confidence is high.
50
50
  def high?
51
51
  @value == HIGH
52
52
  end
53
53
 
54
54
  # Check if this is a medium confidence level.
55
- # @return [Boolean] True if confidence is medium.
55
+ # @return [Boolean] `true` if confidence is medium.
56
56
  def medium?
57
57
  @value == MEDIUM
58
58
  end
59
59
 
60
60
  # Check if this is a low confidence level.
61
- # @return [Boolean] True if confidence is low.
61
+ # @return [Boolean] `true` if confidence is low.
62
62
  def low?
63
63
  @value == LOW
64
64
  end
@@ -71,7 +71,7 @@ module Mindee
71
71
 
72
72
  # Compare two FieldConfidence instances.
73
73
  # @param other [FieldConfidence] The other confidence to compare.
74
- # @return [Boolean] True if they have the same value.
74
+ # @return [Boolean] `true` if they have the same value.
75
75
  def ==(other)
76
76
  other.is_a?(FieldConfidence) && @value == other.value
77
77
  end
@@ -43,7 +43,7 @@ module Mindee
43
43
 
44
44
  # Check if method_missing should handle the method.
45
45
  # @param method_name [Symbol] The method name.
46
- # @return [Boolean] True if the method should be handled.
46
+ # @return [Boolean] `true` if the method should be handled.
47
47
  def respond_to_missing?(method_name, include_private = false)
48
48
  key?(method_name.to_s) || super
49
49
  end
@@ -49,7 +49,7 @@ module Mindee
49
49
  end
50
50
 
51
51
  # Check if the list is empty.
52
- # @return [Boolean] True if the list has no items.
52
+ # @return [Boolean] `true` if the list has no items.
53
53
  def empty?
54
54
  @items.empty?
55
55
  end
@@ -74,7 +74,7 @@ module Mindee
74
74
  end
75
75
 
76
76
  # Iterator for Enumerator inheritance.
77
- # NOTE: Untyped due to incomplete support in steep.
77
+ # NOTE: Untyped due to incomplete support in current supported version of RBS.
78
78
  def each(&block)
79
79
  return to_enum(:each) unless block_given?
80
80
 
@@ -91,7 +91,7 @@ module Mindee
91
91
 
92
92
  # Check if method_missing should handle the method.
93
93
  # @param method_name [Symbol] The method name.
94
- # @return [Boolean] True if the method should be handled.
94
+ # @return [Boolean] `true` if the method should be handled.
95
95
  def respond_to_missing?(method_name, include_private = false)
96
96
  @fields.respond_to?(method_name) || super
97
97
  end
@@ -26,7 +26,6 @@ module Mindee
26
26
  if @value.is_a?(TrueClass) || @value.is_a?(FalseClass)
27
27
  @value ? 'True' : 'False'
28
28
  elsif @value.is_a?(Integer) || @value.is_a?(Float)
29
- # NOTE: explicitly typing because steep is very, very dumb
30
29
  num = @value # @type var num: Integer | Float
31
30
  format_numeric_value(num)
32
31
  else
@@ -44,7 +44,7 @@ module Mindee
44
44
  # Checks a PDF's stream content for text operators
45
45
  # See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf page 243-251.
46
46
  # @param [StringIO] stream Stream object from a PDF's page.
47
- # @return [bool] True if a text operator is found in the stream.
47
+ # @return [bool] `true` if a text operator is found in the stream.
48
48
  def self.stream_has_text?(stream)
49
49
  data = stream.data
50
50
  return false if data.nil? || data.empty?
@@ -55,7 +55,7 @@ module Mindee
55
55
 
56
56
  # Checks whether the file has source_text. Sends false if the file isn't a PDF.
57
57
  # @param [StringIO] pdf_data A binary-encoded stream representing the PDF file.
58
- # @return [bool] True if the pdf has source text, false otherwise.
58
+ # @return [bool] `true` if the pdf has source text, `false` otherwise.
59
59
  def self.source_text?(pdf_data)
60
60
  begin
61
61
  pdf_data.rewind
@@ -3,7 +3,7 @@
3
3
  # Mindee
4
4
  module Mindee
5
5
  # Current version.
6
- VERSION = '4.7.0-rc1'
6
+ VERSION = '4.7.0-rc2'
7
7
 
8
8
  # Finds and return the current platform.
9
9
  # @return [Symbol, Hash[String | Symbol, Regexp], Nil?]
@@ -1,5 +1,5 @@
1
1
  # Stub for the mini_magick library.
2
- # Note: though typing annotations for the MiniMagick library now exist, it seems that they aren't strict enough
2
+ # NOTE: though typing annotations for the MiniMagick library now exist, it seems that they aren't strict enough
3
3
  # to match the rules we have on the repo, hence the existence of this file and the overrides present below.
4
4
  module MiniMagick
5
5
  class Image
@@ -8,7 +8,7 @@ module Mindee
8
8
  def self.process_secret_key: (String) -> String
9
9
  def get_hmac_signature: (String) -> String
10
10
  def valid_hmac_signature?: (String, String) -> bool
11
- def deserialize_response: (singleton(Parsing::V2::JobResponse) | singleton(Parsing::V2::InferenceResponse))-> (Parsing::V2::JobResponse | Parsing::V2::InferenceResponse)
11
+ def deserialize_response: (singleton(Parsing::V2::CommonResponse))-> (Parsing::V2::JobResponse | Parsing::V2::InferenceResponse)
12
12
  end
13
13
  end
14
14
  end
@@ -8,10 +8,14 @@ module Mindee
8
8
  attr_reader filename: String
9
9
  attr_reader io_stream: StringIO | File
10
10
  def initialize: (StringIO | File, String, ?repair_pdf: bool) -> void
11
+
12
+ def fix_pdf!: (?maximum_offset: Integer) -> void
13
+ def self.fix_pdf: (StringIO | File, ?maximum_offset: Integer) -> StringIO
14
+
11
15
  def logger: () -> Logger
12
16
 
13
17
 
14
- def rescue_broken_pdf: (StringIO | File) -> (StringIO | File)
18
+ def rescue_broken_pdf: (untyped) -> void
15
19
  def pdf?: -> bool
16
20
  def apply_page_options: (PageOptions) -> StringIO?
17
21
  def process_pdf: (PageOptions) -> StringIO?
@@ -12,7 +12,6 @@ module Mindee
12
12
  def empty?: -> bool
13
13
  def size: -> Integer
14
14
  def length: -> Integer
15
- # NOTE: Steep is incapable of handling typing of `each` when multiple types are used.
16
15
  def each: () { (untyped) -> untyped } -> untyped
17
16
  def []: (Integer) -> (BaseField)
18
17
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mindee
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.7.0.pre.rc1
4
+ version: 4.7.0.pre.rc2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mindee, SA
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-08-13 00:00:00.000000000 Z
11
+ date: 2025-08-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64