onlyoffice_pdf_parser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 661dfba5483c4463abeac412e29aef2c7813ac31edf78e17ca3593ed879e10aa
4
+ data.tar.gz: 5bcabaa8c35ecf07a814f555693c64720234bffd1e9ee2e82b831e8c65f72a09
5
+ SHA512:
6
+ metadata.gz: a91f4477b8d9a32de8459856a6e3e965380f5143be90cea0c67b1b94367ecfc1e4961f34be916dc5ae9bfbdf634d0f331d3a45f0b7c34261cf6d7ad9ddc392eb
7
+ data.tar.gz: b0960da736c0e12c99f02c7dc8c8dc3c08569b25f81b5995b858fd060322e77ab7c3145c4aeca9670be3ef12e8247d8550b410335c3b407bdc5a5a0b7aaf8f55
data/README.md ADDED
@@ -0,0 +1,12 @@
1
+ # Onlyoffice PDF Parser
2
+
3
+ A Ruby gem for parsing PDF files.
4
+
5
+ ### Example
6
+
7
+ ```
8
+ require 'onlyoffice_pdf_parser'
9
+
10
+ OnlyofficePdfParser::PdfParser.parse('Text.pdf')
11
+
12
+ ```
@@ -0,0 +1,110 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'image_size'
4
+ require 'rmagick'
5
+ require_relative 'helpers/array_helper'
6
+ require_relative 'helpers/cursor_point'
7
+
8
module OnlyofficePdfParser
  # Stores the pixel data of a bmp image and searches it for sub-images
  class BmpImage
    include Magick

    # @return [String] path of the source file or a placeholder for in-memory data
    attr_accessor :path_to_image
    # @return [Array<Array>] pixels of image grouped by rows
    attr_accessor :pixels
    # @return [Integer] width of image in pixels
    attr_accessor :width
    # @return [Integer] height of image in pixels
    attr_accessor :height
    # @return [String] binary data of file
    attr_reader :data

    # @param param [String, nil] path to image file or binary data of image.
    #   Without it an empty image is created and nothing is loaded
    def initialize(param = nil)
      return unless param

      init_data(param)
      image_size = ImageSize.new(data).size

      @width = image_size.first
      @height = image_size.last
      fetch_pixels
    end

    # @return [String] path to image
    def to_s
      path_to_image
    end

    # Compare two images by dimensions and pixel data
    # @param other [Object] object to compare with
    # @return [True, False] false for any object which is not image-like,
    #   same convention as in `CursorPoint#==`
    def ==(other)
      return false unless %i[width height pixels].all? { |reader| other.respond_to?(reader) }
      return false unless other.width == width && other.height == height

      pixels == other.pixels
    end

    # Cut rectangular area from current image
    # @param start_point [CursorPoint] top left corner of area
    # @param width [Integer] width of area
    # @param height [Integer] height of area
    # @return [BmpImage, nil] new image without file data,
    #   nil if area goes below the last row of image
    def get_sub_image(start_point = CursorPoint.new(0, 0), width = 0, height = 0)
      pixels_array = Array.new(height) do |current_height|
        Array.new(width) do |current_width|
          pixel_line = pixels[start_point.top + current_height]
          # Requested area runs past the last row of image, nothing to compare
          return nil unless pixel_line

          pixel_line[start_point.left + current_width]
        end
      end
      sub_image = BmpImage.new
      sub_image.pixels = pixels_array
      sub_image.width = width
      sub_image.height = height
      sub_image
    end

    # Find all places in which sub image is located
    # @param path_to_sub_image [String] path to file (or binary data) of sub image
    # @return [Array<CursorPoint>] top left corners of all found matches
    def get_sub_image_array(path_to_sub_image)
      sub_image = BmpImage.new(path_to_sub_image)
      first_sub_image_line = sub_image.pixels.first
      coordinates_array = []
      pixels.each_with_index do |current_line, image_line_index|
        # Cheap filter: only rows containing first row of sub image are candidates
        included_indexes = ArrayHelper.get_array_inclusion_indexes(current_line, first_sub_image_line)
        included_indexes.each do |current_included_index|
          coordinates = image_location_start_find(current_included_index, image_line_index)
          got_sub_image = get_sub_image(coordinates, sub_image.width, sub_image.height)
          coordinates_array << coordinates if got_sub_image == sub_image
        end
      end
      coordinates_array
    end

    private

    # @param param [String] file path or binary data of file
    # @return [Void] init class data
    def init_data(param)
      if OnlyofficePdfParser::FileHelper.file_path?(param)
        # Image is binary data, text mode reading may corrupt or re-encode it
        @data = File.binread(param)
        @path_to_image = param
      else
        @data = param
        @path_to_image = '[Binary Steam]'
      end
    end

    # @return [Void] Fill @pixels with data
    def fetch_pixels
      tmp_file = Tempfile.new('onlyoffice_pdf_parser')
      begin
        tmp_file.binmode
        tmp_file.write(data)
        # Close before reading so all data is flushed to disk
        tmp_file.close
        @pixels = ImageList.new(tmp_file.path).get_pixels(0, 0, width, height).each_slice(width).to_a
      ensure
        # Always release descriptor and remove file, even if image reading failed
        tmp_file.close!
      end
    end

    # @param x_coordinate [Integer] x of start search
    # @param y_coordinate [Integer] y of start search
    # @return [CursorPoint] point to start
    def image_location_start_find(x_coordinate, y_coordinate)
      CursorPoint.new(x_coordinate % width, y_coordinate)
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module OnlyofficePdfParser
  # Methods to help working with array
  class ArrayHelper
    # Get list of indexes in which one array is included to another
    # @param array [Array] big array
    # @param included_array [Array] array to search
    # @return [Array<Integer>] start indexes of every occurrence
    #   (occurrences may overlap), empty if `included_array` is empty
    def self.get_array_inclusion_indexes(array,
                                         included_array)
      # Empty pattern has no first element to anchor on, so treat it as "not found"
      # instead of matching every nil element of the big array
      return [] if included_array.empty?

      first_element = included_array.first
      pattern_length = included_array.length
      # Pattern cannot fit after this index, no reason to check further
      last_start = array.length - pattern_length
      (0..last_start).select do |start_index|
        # Compare first element before slicing to avoid useless allocations
        array[start_index] == first_element &&
          array.slice(start_index, pattern_length) == included_array
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
module OnlyofficePdfParser
  # Class for working with cursor coordinates
  class CursorPoint
    # @return [Object] horizontal (left) and vertical (top) coordinates
    attr_accessor :left, :top

    # @param left [Object] horizontal coordinate
    # @param top [Object] vertical coordinate
    def initialize(left, top)
      @left = left
      @top = top
    end

    # Reader aliases, so the same class can describe a point or dimensions
    alias width left
    alias height top
    alias x left
    alias y top

    # @return [CursorPoint] independent copy of point
    def dup
      CursorPoint.new(@left, @top)
    end

    # @return [String] coordinates in `[left, top]` form
    def to_s
      "[#{@left}, #{@top}]"
    end

    # @param other [Object] object to compare with
    # @return [True, False] true if other has the same `left` and `top`
    def ==(other)
      if other.respond_to?(:left) && other.respond_to?(:top)
        @left == other.left && @top == other.top
      else
        false
      end
    end

    # Strict equality used by Hash and `Array#uniq`
    # Defined together with `hash` so equal points behave as one Hash key
    # @param other [Object] object to compare with
    # @return [True, False]
    def eql?(other)
      other.is_a?(CursorPoint) && @left.eql?(other.left) && @top.eql?(other.top)
    end

    # @return [Integer] hash code consistent with `eql?`
    def hash
      [@left, @top].hash
    end

    # Hash-like access to dimensions
    # @param name [Symbol] `:width` or `:height`
    # @return [Object, String] requested value,
    #   string 'Unknown attribute' for any other name
    def [](name)
      case name
      when :width
        left
      when :height
        top
      else
        'Unknown attribute'
      end
    end
  end

  Dimensions = CursorPoint
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module OnlyofficePdfParser
  # Class for working with files
  class FileHelper
    class << self
      # Reserve unique name of `.bmp` file in temp directory
      # File itself is removed, only its path is returned
      # @return [String] name for temp file
      def generate_temp_name
        file = Tempfile.new(%w[onlyoffice_bmp_parser .bmp])
        # Path must be read before removing, it is not available afterwards
        path = file.path
        # `close!` releases the descriptor and unlinks the file;
        # plain `unlink` kept the descriptor open until garbage collection
        file.close!
        path
      end

      # @param string [String] path candidate, may be binary data
      # @return [True, False] if string is file path
      def file_path?(string)
        File.exist?(string)
      rescue ArgumentError
        # Binary data usually contains null bytes, which are not allowed in paths
        false
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Helper methods to convert pdf to bmp
4
module PdfConvertToBmpHelper
  # Fill @pages_in_bmp with data
  # Every page of @file_path is rendered to bmp by ImageMagick `convert`,
  # read as binary string and appended to @pages_in_bmp in page order
  # @return [Array<String>] paths of rendered (already removed) page files
  def fetch_bmp_binary
    temp_path = OnlyofficePdfParser::FileHelper.generate_temp_name
    # Argument list form runs `convert` without a shell, so quotes,
    # `$` or backticks in file path cannot break or alter the command
    system('convert', @file_path.to_s, temp_path, out: File::NULL)
    files = multipage_files(temp_path)
    files.each do |file|
      @pages_in_bmp << File.binread(file)
      # Rendered page is no longer needed, do not litter temp directory
      File.delete(file)
    end
  end

  private

  # @param path_pattern [String] pattern to find bmps
  # @return [Array<String>] list of bmps ordered by page number
  def multipage_files(path_pattern)
    files_dir = File.dirname(path_pattern)
    files_base = File.basename(path_pattern, '.*')
    # Multipage result is stored as `name-0.bmp`, `name-1.bmp`, ...
    # Glob order is not guaranteed and plain sort puts `-10` before `-2`,
    # so order by numeric page suffix
    Dir["#{files_dir}/#{files_base}*"].sort_by do |file|
      [file[/-(\d+)\.[^.]*\z/, 1].to_i, file]
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module OnlyofficePdfParser
  # Module for helper methods for `pdf-reader` gem
  module PdfReaderHelper
    # Extract font name from `:F1` font entry of page
    # @param page [PDF::Reader::Page] page to parse
    # @return [String, Symbol] part of `BaseFont` value after first "+"
    #   with all "+" removed (empty string if there is no "+"),
    #   `:unknown` if page has no `:F1` font
    def parse_font(page)
      first_font = page.fonts[:F1]
      return :unknown if first_font.nil?

      base_font = first_font[:BaseFont].to_s
      # Keep everything starting from first "+" and then drop the "+" signs
      base_font[/\+.*/].to_s.delete('+')
    end
  end
end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pdf/reader'
4
+ require 'tempfile'
5
+ require_relative 'helpers/file_helper'
6
+ require_relative 'pdf_structure/pdf_reader_helper'
7
+ require_relative 'pdf_structure/pdf_convert_to_bmp_helper'
8
+
9
module OnlyofficePdfParser
  # Class for working and parsing PDF files
  class PdfStructure
    include PdfConvertToBmpHelper
    include PdfReaderHelper

    # Known page sizes in points, portrait orientation
    PAGE_SIZE_FOR_PDF = { 'US Letter' => [612, 792],
                          'US Legal' => [612, 1008],
                          'A4' => [595, 842],
                          'A5' => [420, 595],
                          'B5' => [499, 709],
                          'Envelope #10' => [297, 684],
                          'Envelope DL' => [312, 624],
                          'Tabloid' => [792, 1224],
                          'A3' => [842, 1191],
                          'Tabloid Oversize' => [864, 1296],
                          'ROC 16K' => [558, 774],
                          'Envelope Choukei 3' => [340, 666],
                          'Super B/A3' => [936, 1368] }.freeze

    # @return [Array<Hash>] array of pages, each with `:text` and `:fonts` keys
    attr_accessor :pages
    # @return [String] full path to file
    attr_accessor :file_path
    # @return [Array<String>] bin representation of bmps
    attr_reader :pages_in_bmp

    # @param pages [Array] initial list of pages
    # @param file_path [String, nil] path to pdf file
    def initialize(pages: [], file_path: nil)
      @file_path = file_path
      @pages = pages
      @pages_in_bmp = []
    end

    # Hash-like access to parsed data
    # @param parameter [Symbol] `:pages` or `:page_size`
    # @return [Object] stored value; `:page_size` is nil until `page_size` was called
    # @raise [RuntimeError] if parameter is unknown
    def [](parameter)
      case parameter
      when :pages
        @pages
      when :page_size
        @page_size
      else
        raise "Unknown instance variable - #{parameter}."
      end
    end

    # @param path_to_patter [String] path to file (or binary data) of pattern image
    # @return [True, false] Check if pdf file contains graphic pattern
    def contain_pattern?(path_to_patter)
      pages_in_bmp.any? do |current_page|
        !BmpImage.new(current_page).get_sub_image_array(path_to_patter).empty?
      end
    end

    # Parse file using `pdf-reader` gem
    # Text and font of each page are appended to `pages`
    def pdf_reader_parse
      PDF::Reader.open(file_path.to_s) do |reader|
        reader.pages.each do |page|
          @pages << { text: page.text,
                      fonts: parse_font(page) }
        end
      end
    end

    # Size is taken from output of `pdfinfo` utility and cached
    # @return [Array <Integer>] page size of pdf in points
    # @raise [RuntimeError] if `pdfinfo` output has no page size
    def page_size_points
      return @page_size_points if @page_size_points

      # Argument list form runs `pdfinfo` without a shell, so quotes,
      # `$` or backticks in file path cannot break or alter the command.
      # `scrub` protects regexp matching from invalid bytes in document metadata
      pdfinfo = IO.popen(['pdfinfo', @file_path.to_s], &:read).scrub
      sizes = pdfinfo.match(/Page size:\s*(\S+) x (\S+)\s*pts/)
      raise "Cannot detect page size of #{@file_path}" unless sizes

      @page_size_points = sizes.captures.map { |size| size.to_f.round }
    end

    # @return [String, nil] name of page size, prefixed with "Landscape "
    #   for rotated page, nil if size is not in `PAGE_SIZE_FOR_PDF`
    def page_size
      points = page_size_points
      portrait_name = PAGE_SIZE_FOR_PDF.key(points)
      landscape_name = PAGE_SIZE_FOR_PDF.key(points.reverse)
      # Prefix is added only for a known size, unknown size must stay nil
      @page_size = if portrait_name
                     portrait_name
                   elsif landscape_name
                     "Landscape #{landscape_name}"
                   end
    end

    # Parse text, fonts, page images and page size of file
    # @param filename [String] path to pdf file
    # @return [PdfStructure] parsed file
    def self.parse(filename)
      file = new(pages: [], file_path: filename)
      file.pdf_reader_parse
      file.fetch_bmp_binary
      file.page_size
      file
    end
  end

  PdfParser = PdfStructure
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
module OnlyofficePdfParser
  # Namespace for version data of gem
  module Version
    # Current version of gem
    STRING = '0.1.0'
  end
end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'onlyoffice_pdf_parser/bmp_image.rb'
4
+ require_relative 'onlyoffice_pdf_parser/pdf_structure.rb'
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: onlyoffice_pdf_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Pavel Lobashov
8
+ - Dmitry Rotaty
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2020-03-13 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: image_size
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '2'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '2'
28
+ - !ruby/object:Gem::Dependency
29
+ name: pdf-reader
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '2'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '2'
42
+ - !ruby/object:Gem::Dependency
43
+ name: rmagick
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '2'
49
+ - - "<"
50
+ - !ruby/object:Gem::Version
51
+ version: '5'
52
+ type: :runtime
53
+ prerelease: false
54
+ version_requirements: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: '2'
59
+ - - "<"
60
+ - !ruby/object:Gem::Version
61
+ version: '5'
62
+ description: Gem for parsing PDF files by ONLYOFFICE
63
+ email:
64
+ - shockwavenn@gmail.com
65
+ - kvazilife@gmail.com
66
+ executables: []
67
+ extensions: []
68
+ extra_rdoc_files: []
69
+ files:
70
+ - README.md
71
+ - lib/onlyoffice_pdf_parser.rb
72
+ - lib/onlyoffice_pdf_parser/bmp_image.rb
73
+ - lib/onlyoffice_pdf_parser/helpers/array_helper.rb
74
+ - lib/onlyoffice_pdf_parser/helpers/cursor_point.rb
75
+ - lib/onlyoffice_pdf_parser/helpers/file_helper.rb
76
+ - lib/onlyoffice_pdf_parser/pdf_structure.rb
77
+ - lib/onlyoffice_pdf_parser/pdf_structure/pdf_convert_to_bmp_helper.rb
78
+ - lib/onlyoffice_pdf_parser/pdf_structure/pdf_reader_helper.rb
79
+ - lib/onlyoffice_pdf_parser/version.rb
80
+ homepage: https://github.com/onlyoffice-testing-robot/onlyoffice_pdf_parser
81
+ licenses:
82
+ - AGPL-3.0
83
+ metadata: {}
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ requirements: []
99
+ rubygems_version: 3.0.6
100
+ signing_key:
101
+ specification_version: 4
102
+ summary: ONLYOFFICE PDF Parser Gem
103
+ test_files: []