ocr-file 0.0.7 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8b87806d21622a72c6166c35fe4367f5b07135e5e7fab4e8be8b8941f75439dc
4
- data.tar.gz: d342a91e9b23f8677784553327ba1cc1c00e1599415512b28226f8e9f6bc55b4
3
+ metadata.gz: e77eefa085a14282b42584bd4bf6796a99e10589552b52858cb8f5dd75c84b97
4
+ data.tar.gz: 9fde9adae0c252ecc56937a314c676903a6b2a6ababe51030cbcd9ab3ee1ba81
5
5
  SHA512:
6
- metadata.gz: ecadeeb21a358274bce4ed3d7fce66e53d31ff3abe940ff1b9d77893f12b73bfd41e9ac35324e3a98f004638f9d1906760ef962a3637fbaf48973faeec9a17cb
7
- data.tar.gz: 5d4a149dd6d0da1feb723b08c327edab414b75f0b633cea53aaee00d43313d26b84659956957acec7550a822998b76a760b3888770a606d8b4a1f9bb14f807c2
6
+ metadata.gz: f6c9cf596d6a78ccea7e1fb45543826e1b95dab74449700eb0f0bed4bace802fbc15fba118a982fbf7daed9ebb188e417876f1e6a9cb6f620eb3630a4aaed7af
7
+ data.tar.gz: 9aab569d476170d8c7b405f65a1629d8ec789f008f734a7cc7f49c5716c89c7a7b5010ae3096ed7e4b3358fe650c221da658801a893cd76f6183f23b19696349
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ocr-file (0.0.6)
4
+ ocr-file (0.0.9)
5
5
  active_attr (~> 0.15.4)
6
6
  console-style (~> 0.0.1)
7
7
  hexapdf (~> 0.23.0)
data/README.md CHANGED
@@ -45,6 +45,7 @@ You will need to install `tesseract` with your desired language on your system,
45
45
  image_preprocess: true,
46
46
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], # Applies effects as listed. 'norm' is also available
47
47
  automatic_reprocess: true, # Will possibly do double + the operations but can produce better results automatically
48
+ dimensions: [width, height], # May be nil; when set, images are locked to this width and height
48
49
  # PDF to Image Processing
49
50
  optimise_pdf: true,
50
51
  extract_pdf_images: true, # if false will screenshot each PDF page
@@ -79,9 +80,10 @@ You will need to install `tesseract` with your desired language on your system,
79
80
 
80
81
  # How to merge files into a single PDF:
81
82
  # The files can be images or other PDFs
82
- filepaths = []
83
- documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.open_pdf(path, password: '') }
84
- merged_document = OcrFile::ImageEngines::PdfEngine.merge(documents)
83
+ file_paths = []
84
+ merged_document = ::HexaPDF::Document.new
85
+ dimensions = [width, height] # or nil to maintain dimensions
86
+ documents = file_paths.map { |path| OcrFile::ImageEngines::PdfEngine.insert_image(merged_document, path, dimensions: dimensions) }
85
87
  OcrFile::ImageEngines::PdfEngine.save_pdf(merged_document, save_file_path, optimise: true)
86
88
  ```
87
89
 
@@ -25,6 +25,7 @@ module OcrFile
25
25
  image_preprocess: true,
26
26
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
27
27
  automatic_reprocess: true,
28
+ dimensions: nil, # width, height. Will lock images to these dimensions
28
29
  # PDF to Image Processing
29
30
  optimise_pdf: true,
30
31
  extract_pdf_images: true, # if false will screenshot each PDF page
@@ -66,7 +67,7 @@ module OcrFile
66
67
 
67
68
  def image?
68
69
  return false if pdf?
69
- ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
70
+ ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}") }
70
71
  end
71
72
 
72
73
  # Treat anything which isn't a PDF or image as text
@@ -5,7 +5,7 @@ module OcrFile
5
5
  # Conversion of image types
6
6
  # Rotation and detection of skew
7
7
 
8
- attr_reader :image_path, :image, :temp_path, :save_file_path, :config
8
+ attr_reader :image_path, :image, :temp_path, :save_file_path, :config, :width, :height
9
9
 
10
10
  def initialize(image_path:, temp_path:, save_file_path:, config:)
11
11
  @image_path = image_path
@@ -22,11 +22,18 @@ module OcrFile
22
22
  # end
23
23
 
24
24
  @image = MiniMagick::Image.open(image_path)
25
+
26
+ @width = @image[:width]
27
+ @height = @image[:height]
25
28
  end
26
29
 
27
30
  def convert!
28
31
  return @image_path unless @config[:image_preprocess]
29
32
 
33
+ if @config[:dimensions].is_a?(Array) && @config[:dimensions].size == 2
34
+ resize(width, height)
35
+ end
36
+
30
37
  @config[:effects].each do |effect|
31
38
  self.send(effect.to_sym)
32
39
  end
@@ -39,6 +46,10 @@ module OcrFile
39
46
  @save_file_path
40
47
  end
41
48
 
49
+ def resize(width, height)
50
+ @image.resize("#{width}x#{height}")
51
+ end
52
+
42
53
  # Effects
43
54
  # http://www.imagemagick.org/script/command-line-options.php
44
55
  def bw
@@ -61,9 +61,26 @@ module OcrFile
61
61
  image_paths
62
62
  end
63
63
 
64
- def insert_image(document, image_path)
65
- canvas = document.pages.add.canvas
66
- canvas.image(image_path, at: [0, 0], height: 700)
64
+ def insert_image(document, image_path, dimensions: nil)
65
+ image_processor = OcrFile::ImageEngines::ImageMagick.new(
66
+ image_path: image_path,
67
+ temp_path: @temp_folder_path,
68
+ save_file_path: '',
69
+ config: @config
70
+ )
71
+
72
+ if dimensions
73
+ width = dimensions[0]
74
+ height = dimensions[1]
75
+ else
76
+ width = image_processor.width
77
+ height = image_processor.height
78
+ end
79
+
80
+ page = document.pages.add([0, 0, width, height])
81
+ page.canvas.image(@image || image_path, at: [0, 0], width: width, height: height)
82
+
83
+ document
67
84
  end
68
85
 
69
86
  def combine(text, pdf_of_images)
@@ -1,3 +1,3 @@
1
1
  module OcrFile
2
- VERSION = "0.0.7"
2
+ VERSION = "0.0.10"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ocr-file
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-07-22 00:00:00.000000000 Z
11
+ date: 2023-07-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: console-style