onlyoffice_pdf_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 661dfba5483c4463abeac412e29aef2c7813ac31edf78e17ca3593ed879e10aa
4
+ data.tar.gz: 5bcabaa8c35ecf07a814f555693c64720234bffd1e9ee2e82b831e8c65f72a09
5
+ SHA512:
6
+ metadata.gz: a91f4477b8d9a32de8459856a6e3e965380f5143be90cea0c67b1b94367ecfc1e4961f34be916dc5ae9bfbdf634d0f331d3a45f0b7c34261cf6d7ad9ddc392eb
7
+ data.tar.gz: b0960da736c0e12c99f02c7dc8c8dc3c08569b25f81b5995b858fd060322e77ab7c3145c4aeca9670be3ef12e8247d8550b410335c3b407bdc5a5a0b7aaf8f55
data/README.md ADDED
@@ -0,0 +1,12 @@
1
+ # Onlyoffice PDF Parser
2
+
3
+ It is a gem for parsing PDF files.
4
+
5
+ ### Example
6
+
7
+ ```
8
+ require 'onlyoffice_pdf_parser'
9
+
10
+ OnlyofficePdfParser::PdfParser.parse('Text.pdf')
11
+
12
+ ```
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'image_size'
4
+ require 'rmagick'
5
+ require_relative 'helpers/array_helper'
6
+ require_relative 'helpers/cursor_point'
7
+
8
module OnlyofficePdfParser
  # Stores the pixel data of a bitmap image and searches it for sub-images.
  #
  # An instance is built either from a path to an image file or from a String
  # holding the binary image data. Pixels are kept as an Array of rows, each
  # row being an Array of the pixel objects returned by RMagick.
  class BmpImage
    include Magick
    attr_accessor :path_to_image, :pixels, :width, :height
    # @return [String] binary data of file
    attr_reader :data

    # @param param [String, nil] path to an image file or binary image data;
    #   when nil an empty image is created and nothing is loaded
    def initialize(param = nil)
      return unless param

      init_data(param)
      @width, @height = ImageSize.new(data).size
      fetch_pixels
    end

    # @return [String] path the image was read from, or a placeholder for
    #   binary input; empty string for an image created without data
    def to_s
      path_to_image.to_s
    end

    # Two images are equal when their dimensions and every pixel are equal.
    # @param other [Object] object to compare with
    # @return [True, False] false for anything that is not image-like
    def ==(other)
      return false unless %i[width height pixels].all? { |attribute| other.respond_to?(attribute) }

      other.width == width && other.height == height && pixels == other.pixels
    end

    # Cut a rectangular area out of the image.
    # @param start_point [CursorPoint] top-left corner of the area
    # @param width [Integer] width of the area in pixels
    # @param height [Integer] height of the area in pixels
    # @return [BmpImage, nil] the cut-out area, or nil if the area runs past
    #   the bottom or the right edge of the image
    def get_sub_image(start_point = CursorPoint.new(0, 0), width = 0, height = 0)
      pixels_array = Array.new(height) do |row_offset|
        pixel_line = pixels[start_point.top + row_offset]
        # An area that does not fit would otherwise be padded with nil pixels
        out_of_bounds = pixel_line.nil? || start_point.left + width > pixel_line.length
        return nil if width.positive? && out_of_bounds

        Array.new(width) { |column_offset| pixel_line[start_point.left + column_offset] }
      end
      sub_image = BmpImage.new
      sub_image.pixels = pixels_array
      sub_image.width = width
      sub_image.height = height
      sub_image
    end

    # Find every position at which the given image occurs inside this image.
    # @param path_to_sub_image [String] path (or binary data) of image to find
    # @return [Array<CursorPoint>] top-left corners of all occurrences
    def get_sub_image_array(path_to_sub_image)
      sub_image = BmpImage.new(path_to_sub_image)
      first_sub_image_line = sub_image.pixels.first
      pixels.each_with_index.flat_map do |current_line, line_index|
        # The first line of the pattern is a cheap pre-filter; only candidate
        # positions get the full pixel-by-pixel comparison
        candidates = ArrayHelper.get_array_inclusion_indexes(current_line, first_sub_image_line)
        candidates.map { |column_index| image_location_start_find(column_index, line_index) }
                  .select { |point| get_sub_image(point, sub_image.width, sub_image.height) == sub_image }
      end
    end

    private

    # @param param [String] file path or binary data of file
    # @return [Void] init class data
    def init_data(param)
      if OnlyofficePdfParser::FileHelper.file_path?(param)
        # binread: image data must not go through text-mode translation
        @data = File.binread(param)
        @path_to_image = param
      else
        @data = param
        @path_to_image = '[Binary Steam]'
      end
    end

    # RMagick reads from a file, so the data is written to a temporary file
    # which is always removed afterwards, even when reading fails.
    # @return [Void] Fill @pixels with data
    def fetch_pixels
      tmp_file = Tempfile.new('onlyoffice_pdf_parser')
      begin
        tmp_file.binmode
        tmp_file.write(data)
        tmp_file.close
        @pixels = ImageList.new(tmp_file.path).get_pixels(0, 0, width, height).each_slice(width).to_a
      ensure
        tmp_file.close!
      end
    end

    # @param x_coordinate [Integer] x of start search
    # @param y_coordinate [Integer] y of start search
    # @return [CursorPoint] point to start
    def image_location_start_find(x_coordinate, y_coordinate)
      CursorPoint.new(x_coordinate % width, y_coordinate)
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module OnlyofficePdfParser
  # Methods to help working with array
  class ArrayHelper
    # Get list of indexes at which one array is included in another, i.e.
    # every position where `included_array` occurs as a contiguous run.
    # Overlapping occurrences are all reported.
    # @param array [Array] big array
    # @param included_array [Array, nil] array to search
    # @return [Array<Integer>] start indexes of every occurrence; empty when
    #   there is nothing to search for
    def self.get_array_inclusion_indexes(array, included_array)
      return [] if included_array.nil? || included_array.empty?

      first_element = included_array.first
      length = included_array.length
      array.each_index.select do |index|
        # Comparing the first element avoids slicing at most positions
        array[index] == first_element && array[index, length] == included_array
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
module OnlyofficePdfParser
  # Class for working with cursor coordinates.
  #
  # The same class is also exposed as Dimensions, which is why the two values
  # can be read as left/top, x/y or width/height.
  class CursorPoint
    attr_accessor :left, :top

    # @param left [Integer] horizontal coordinate (or width)
    # @param top [Integer] vertical coordinate (or height)
    def initialize(left, top)
      @left = left
      @top = top
    end

    alias width left
    alias height top
    alias x left
    alias y top

    # @return [CursorPoint] independent copy with the same coordinates
    def dup
      CursorPoint.new(@left, @top)
    end

    # @return [String] coordinates formatted as "[left, top]"
    def to_s
      "[#{@left}, #{@top}]"
    end

    # @param other [Object] anything responding to `left` and `top`
    # @return [True, False] whether both coordinates are equal
    def ==(other)
      return false unless other.respond_to?(:left) && other.respond_to?(:top)

      @left == other.left && @top == other.top
    end

    # Strict equality used by Hash and Array#uniq; kept in sync with #hash so
    # that equal points act as the same Hash key.
    # @param other [Object] object to compare with
    # @return [True, False]
    def eql?(other)
      other.class == self.class && @left.eql?(other.left) && @top.eql?(other.top)
    end

    # @return [Integer] hash code derived from both coordinates
    def hash
      [self.class, @left, @top].hash
    end

    # Hash-like read access to the coordinates.
    # @param name [Symbol] :width, :left or :x for the horizontal value;
    #   :height, :top or :y for the vertical one
    # @return [Object] requested value, or the String 'Unknown attribute'
    #   for any other name
    def [](name)
      case name
      when :width, :left, :x
        left
      when :height, :top, :y
        top
      else
        'Unknown attribute'
      end
    end
  end

  Dimensions = CursorPoint
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module OnlyofficePdfParser
  # Class for working with files
  class FileHelper
    class << self
      # Reserve a unique name for a temporary bmp file. The file itself is
      # removed again, so only the name is handed out and the caller is
      # expected to create the file.
      # @return [String] name for temp file
      def generate_temp_name
        file = Tempfile.new(%w[onlyoffice_bmp_parser .bmp])
        # Path must be read first: it is no longer available after close!
        path = file.path
        # close! releases the file descriptor in addition to deleting the file
        file.close!
        path
      end

      # @param string [Object] value to check, e.g. a path or binary data
      # @return [True, False] if string is path of an existing regular file;
      #   false for directories and for values which cannot be a path at all
      #   (nil, data containing null bytes)
      def file_path?(string)
        File.file?(string)
      rescue ArgumentError, TypeError
        false
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Helper methods to convert pdf to bmp
4
module PdfConvertToBmpHelper
  # Fill @pages_in_bmp with the binary bmp data of every page, in page order.
  #
  # The pdf at @file_path is rendered by the external `convert` tool, which
  # writes one bmp file per page next to a generated temporary name. The
  # files are read and then removed again.
  # @return [Array<String>] paths of the temporary files that were read
  def fetch_bmp_binary
    temp_path = OnlyofficePdfParser::FileHelper.generate_temp_name
    # Array form of popen bypasses the shell, so quotes or `$()` in the file
    # name cannot break or extend the command
    IO.popen(['convert', @file_path.to_s, temp_path], &:read)
    files = multipage_files(temp_path)
    begin
      files.each { |file| @pages_in_bmp << File.binread(file) }
    ensure
      files.each { |file| File.delete(file) if File.exist?(file) }
    end
  end

  private

  # @param path_pattern [String] pattern to find bmps
  # @return [Array<String>] list of bmps ordered by page number
  def multipage_files(path_pattern)
    files_dir = File.dirname(path_pattern)
    files_base = File.basename(path_pattern, '.*')
    Dir[File.join(files_dir, "#{files_base}*")].sort_by do |file|
      # Pages are assumed to be suffixed "-0", "-1", ... "-10"; a plain
      # string sort would put page 10 before page 2
      page_suffix = File.basename(file, '.*')[files_base.length..-1]
      page_suffix.to_s[/\d+/].to_i
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module OnlyofficePdfParser
  # Module for helper methods for `pdf-reader` gem
  module PdfReaderHelper
    # Return font name from page data. Only the font registered as :F1 is
    # inspected.
    # @param page [PDF::Reader::Page] page to parse
    # @return [String, Symbol] font name without subset prefix, or :unknown
    #   if the page has no :F1 font
    def parse_font(page)
      font = page.fonts[:F1]
      return :unknown if font.nil?

      # Names such as "ABCDEF+Arial" carry a prefix ending in "+"; drop it.
      # A name without "+" is returned unchanged instead of as an empty string.
      font[:BaseFont].to_s.sub(/\A[^+]*\+/, '').delete('+')
    end
  end
end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pdf/reader'
4
+ require 'tempfile'
5
+ require_relative 'helpers/file_helper'
6
+ require_relative 'pdf_structure/pdf_reader_helper'
7
+ require_relative 'pdf_structure/pdf_convert_to_bmp_helper'
8
+
9
module OnlyofficePdfParser
  # Class for working and parsing PDF files
  class PdfStructure
    include PdfConvertToBmpHelper
    include PdfReaderHelper

    # Known page sizes as [width, height] in points
    PAGE_SIZE_FOR_PDF = { 'US Letter' => [612, 792],
                          'US Legal' => [612, 1008],
                          'A4' => [595, 842],
                          'A5' => [420, 595],
                          'B5' => [499, 709],
                          'Envelope #10' => [297, 684],
                          'Envelope DL' => [312, 624],
                          'Tabloid' => [792, 1224],
                          'A3' => [842, 1191],
                          'Tabloid Oversize' => [864, 1296],
                          'ROC 16K' => [558, 774],
                          'Envelope Choukei 3' => [340, 666],
                          'Super B/A3' => [936, 1368] }.freeze

    # @return [Array<Hash>] array of pages, each with :text and :fonts
    attr_accessor :pages
    # @return [String] full path to file
    attr_accessor :file_path
    # @return [Array<String>] bin representation of bmps
    attr_reader :pages_in_bmp

    # @param pages [Array] already parsed pages
    # @param file_path [String, nil] path to the pdf file
    def initialize(pages: [], file_path: nil)
      @file_path = file_path
      @pages = pages
      @pages_in_bmp = []
    end

    # Hash-like access to parsed data.
    # @param parameter [Symbol] :pages or :page_size
    # @return [Object] stored value; :page_size is nil until #page_size ran
    # @raise [RuntimeError] for any other parameter
    def [](parameter)
      case parameter
      when :pages
        @pages
      when :page_size
        @page_size
      else
        raise "Unknown instance variable - #{parameter}."
      end
    end

    # @param path_to_patter [String] path to image of the pattern
    # @return [True, false] Check if pdf file contains graphic pattern
    def contain_pattern?(path_to_patter)
      pages_in_bmp.any? do |current_page|
        !BmpImage.new(current_page).get_sub_image_array(path_to_patter).empty?
      end
    end

    # Parse file using `pdf-reader` gem and append one entry per page to
    # @pages
    def pdf_reader_parse
      PDF::Reader.open(file_path.to_s) do |reader|
        reader.pages.each do |page|
          @pages << { text: page.text,
                      fonts: parse_font(page) }
        end
      end
    end

    # Page size as reported by the external `pdfinfo` tool; the result is
    # cached after the first call.
    # @return [Array <Integer>] page size of pdf in points
    # @raise [RuntimeError] if pdfinfo reports no page size
    def page_size_points
      return @page_size_points if @page_size_points

      # Array form of popen bypasses the shell, so the path cannot break or
      # extend the command; scrub guards the regexp against invalid bytes
      pdfinfo = IO.popen(['pdfinfo', @file_path.to_s], &:read).scrub
      size = pdfinfo.match(/Page size:\s*([\d.]+)\s*x\s*([\d.]+)/)
      raise "Cannot detect page size of #{@file_path}" unless size

      @page_size_points = size.captures.map { |points| points.to_f.round }
    end

    # @return [String, nil] name of page size, prefixed with "Landscape " for
    #   rotated pages; nil if the size is not a known one
    def page_size
      points = page_size_points
      portrait = PAGE_SIZE_FOR_PDF.key(points)
      landscape = PAGE_SIZE_FOR_PDF.key(points.reverse)
      @page_size = portrait || (landscape && "Landscape #{landscape}")
    end

    # Parse text, fonts, page images and page size of a pdf file.
    # @param filename [String] path to the pdf file
    # @return [PdfStructure] parsed file
    def self.parse(filename)
      file = PdfStructure.new(pages: [], file_path: filename)
      file.pdf_reader_parse
      file.fetch_bmp_binary
      file.page_size
      file
    end
  end

  PdfParser = PdfStructure
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module OnlyofficePdfParser
4
+ module Version # Namespace holding the version information of the gem
5
+ STRING = '0.1.0' # Version of the gem as a string
6
+ end
7
+ end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'onlyoffice_pdf_parser/bmp_image.rb'
4
+ require_relative 'onlyoffice_pdf_parser/pdf_structure.rb'
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: onlyoffice_pdf_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Pavel Lobashov
8
+ - Dmitry Rotaty
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2020-03-13 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: image_size
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '2'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '2'
28
+ - !ruby/object:Gem::Dependency
29
+ name: pdf-reader
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '2'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '2'
42
+ - !ruby/object:Gem::Dependency
43
+ name: rmagick
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '2'
49
+ - - "<"
50
+ - !ruby/object:Gem::Version
51
+ version: '5'
52
+ type: :runtime
53
+ prerelease: false
54
+ version_requirements: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: '2'
59
+ - - "<"
60
+ - !ruby/object:Gem::Version
61
+ version: '5'
62
+ description: Gem for parsing PDF files by ONLYOFFICE
63
+ email:
64
+ - shockwavenn@gmail.com
65
+ - kvazilife@gmail.com
66
+ executables: []
67
+ extensions: []
68
+ extra_rdoc_files: []
69
+ files:
70
+ - README.md
71
+ - lib/onlyoffice_pdf_parser.rb
72
+ - lib/onlyoffice_pdf_parser/bmp_image.rb
73
+ - lib/onlyoffice_pdf_parser/helpers/array_helper.rb
74
+ - lib/onlyoffice_pdf_parser/helpers/cursor_point.rb
75
+ - lib/onlyoffice_pdf_parser/helpers/file_helper.rb
76
+ - lib/onlyoffice_pdf_parser/pdf_structure.rb
77
+ - lib/onlyoffice_pdf_parser/pdf_structure/pdf_convert_to_bmp_helper.rb
78
+ - lib/onlyoffice_pdf_parser/pdf_structure/pdf_reader_helper.rb
79
+ - lib/onlyoffice_pdf_parser/version.rb
80
+ homepage: https://github.com/onlyoffice-testing-robot/onlyoffice_pdf_parser
81
+ licenses:
82
+ - AGPL-3.0
83
+ metadata: {}
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ requirements: []
99
+ rubygems_version: 3.0.6
100
+ signing_key:
101
+ specification_version: 4
102
+ summary: ONLYOFFICE PDF Parser Gem
103
+ test_files: []