onlyoffice_pdf_parser 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +12 -0
- data/lib/onlyoffice_pdf_parser/bmp_image.rb +110 -0
- data/lib/onlyoffice_pdf_parser/helpers/array_helper.rb +23 -0
- data/lib/onlyoffice_pdf_parser/helpers/cursor_point.rb +47 -0
- data/lib/onlyoffice_pdf_parser/helpers/file_helper.rb +23 -0
- data/lib/onlyoffice_pdf_parser/pdf_structure/pdf_convert_to_bmp_helper.rb +24 -0
- data/lib/onlyoffice_pdf_parser/pdf_structure/pdf_reader_helper.rb +17 -0
- data/lib/onlyoffice_pdf_parser/pdf_structure.rb +97 -0
- data/lib/onlyoffice_pdf_parser/version.rb +7 -0
- data/lib/onlyoffice_pdf_parser.rb +4 -0
- metadata +103 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 661dfba5483c4463abeac412e29aef2c7813ac31edf78e17ca3593ed879e10aa
|
4
|
+
data.tar.gz: 5bcabaa8c35ecf07a814f555693c64720234bffd1e9ee2e82b831e8c65f72a09
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a91f4477b8d9a32de8459856a6e3e965380f5143be90cea0c67b1b94367ecfc1e4961f34be916dc5ae9bfbdf634d0f331d3a45f0b7c34261cf6d7ad9ddc392eb
|
7
|
+
data.tar.gz: b0960da736c0e12c99f02c7dc8c8dc3c08569b25f81b5995b858fd060322e77ab7c3145c4aeca9670be3ef12e8247d8550b410335c3b407bdc5a5a0b7aaf8f55
|
data/README.md
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'image_size'
|
4
|
+
require 'rmagick'
|
5
|
+
require_relative 'helpers/array_helper'
|
6
|
+
require_relative 'helpers/cursor_point'
|
7
|
+
|
8
|
+
module OnlyofficePdfParser
  # Stores the pixel data of a bitmap image and searches it for sub-images.
  class BmpImage
    include Magick
    attr_accessor :path_to_image, :pixels, :width, :height
    # @return [String] binary data of the file
    attr_reader :data

    # @param param [String, nil] path to an image file or the binary content
    #   of an image. When nil, an empty image is created and its attributes
    #   are expected to be filled in through the accessors.
    def initialize(param = nil)
      return unless param

      init_data(param)
      image_size = ImageSize.new(data).size

      @width = image_size.first
      @height = image_size.last
      fetch_pixels
    end

    # @return [String] path of the image (or a placeholder for binary input)
    def to_s
      path_to_image
    end

    # Two images are equal when their dimensions and all pixels are equal.
    # @param other [Object] object to compare with
    # @return [Boolean] false for anything that does not look like an image
    def ==(other)
      # Guard against nil / foreign objects instead of raising NoMethodError
      return false unless %i[width height pixels].all? { |name| other.respond_to?(name) }
      return false unless other.width == width && other.height == height

      pixels == other.pixels
    end

    # Cut a rectangular region out of this image.
    # @param start_point [CursorPoint] top-left corner of the region
    # @param width [Integer] width of the region in pixels
    # @param height [Integer] height of the region in pixels
    # @return [BmpImage, nil] the region, or nil if it runs past the bottom
    #   edge of the image. Pixels past the right edge are filled with nil.
    def get_sub_image(start_point = CursorPoint.new(0, 0), width = 0, height = 0)
      pixels_array = Array.new(height) do |current_height|
        # The row is the same for every pixel of the line, so look it up once
        pixel_line = pixels[start_point.top + current_height]
        # The requested region goes below the bottom border of the image
        return nil unless pixel_line

        Array.new(width) { |current_width| pixel_line[start_point.left + current_width] }
      end

      sub_image = BmpImage.new
      sub_image.pixels = pixels_array
      sub_image.width = width
      sub_image.height = height
      sub_image
    end

    # Find every position at which the given image occurs inside this one.
    # @param path_to_sub_image [String] path to (or binary data of) the image
    #   to look for
    # @return [Array<CursorPoint>] top-left corners of all occurrences
    def get_sub_image_array(path_to_sub_image)
      sub_image = BmpImage.new(path_to_sub_image)
      first_sub_image_line = sub_image.pixels.first

      pixels.each_with_index.flat_map do |current_line, image_line_index|
        # Cheap pre-filter: only positions where the first row of the
        # sub-image matches are worth a full comparison.
        included_indexes = ArrayHelper.get_array_inclusion_indexes(current_line, first_sub_image_line)
        candidates = included_indexes.map do |current_included_index|
          image_location_start_find(current_included_index, image_line_index)
        end
        candidates.select do |coordinates|
          get_sub_image(coordinates, sub_image.width, sub_image.height) == sub_image
        end
      end
    end

    private

    # @param param [String] file path or binary content of the file
    # @return [void] fills @data and @path_to_image
    def init_data(param)
      if OnlyofficePdfParser::FileHelper.file_path?(param)
        # Image data is binary: read it without newline/encoding conversion
        @data = File.binread(param)
        @path_to_image = param
      else
        @data = param
        @path_to_image = '[Binary Steam]'
      end
    end

    # Fill @pixels with rows of pixels read through ImageMagick.
    # @return [void]
    def fetch_pixels
      tmp_file = Tempfile.new('onlyoffice_pdf_parser')
      begin
        tmp_file.binmode
        tmp_file.write(data)
        # Close first so the whole content is flushed before it is read back
        tmp_file.close
        @pixels = ImageList.new(tmp_file.path).get_pixels(0, 0, width, height).each_slice(width).to_a
      ensure
        # Always release the handle and remove the file, even on failure
        tmp_file.close!
      end
    end

    # @param x_coordinate [Integer] x of start search
    # @param y_coordinate [Integer] y of start search
    # @return [CursorPoint] point to start
    def image_location_start_find(x_coordinate, y_coordinate)
      CursorPoint.new(x_coordinate % width, y_coordinate)
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OnlyofficePdfParser
  # Methods to help working with arrays
  class ArrayHelper
    # Get the list of indexes at which one array occurs, as a contiguous
    # run of elements, inside another. Overlapping occurrences are reported.
    # @param array [Array] big array
    # @param included_array [Array] array to search
    # @return [Array<Integer>] start indexes of every occurrence; empty when
    #   `included_array` is empty or never occurs
    def self.get_array_inclusion_indexes(array,
                                         included_array)
      # An empty pattern has no meaningful position; without this guard it
      # would "match" at every nil element of `array`.
      return [] if included_array.empty?

      first_element = included_array.first
      pattern_length = included_array.length
      array.each_index.select do |index|
        # Compare the first element before slicing to avoid allocating a
        # sub-array for positions that cannot match.
        array[index] == first_element && array[index, pattern_length] == included_array
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OnlyofficePdfParser
  # A pair of coordinates (left/top), also usable as a width/height pair
  # through the `Dimensions` alias.
  class CursorPoint
    attr_accessor :left, :top

    # @param left [Integer] horizontal coordinate (or width)
    # @param top [Integer] vertical coordinate (or height)
    def initialize(left, top)
      @left = left
      @top = top
    end

    alias width left
    alias height top
    alias x left
    alias y top

    # @return [CursorPoint] independent copy of this point
    def dup
      CursorPoint.new(@left, @top)
    end

    # @return [String] point formatted as "[left, top]"
    def to_s
      "[#{@left}, #{@top}]"
    end

    # @param other [Object] anything responding to `left` and `top`
    # @return [Boolean] true if both coordinates are equal
    def ==(other)
      return false unless other.respond_to?(:left) && other.respond_to?(:top)

      @left == other.left && @top == other.top
    end

    # Strict equality used by Hash, Set and Array#uniq; kept consistent
    # with #hash so equal points act as the same key.
    # @param other [Object] object to compare with
    # @return [Boolean] true for a CursorPoint with identical coordinates
    def eql?(other)
      other.is_a?(CursorPoint) && @left.eql?(other.left) && @top.eql?(other.top)
    end

    # @return [Integer] hash value derived from both coordinates
    def hash
      [CursorPoint, @left, @top].hash
    end

    # Hash-like access to the dimensions.
    # @param name [Symbol] :width or :height
    # @return [Object] the requested value, or the string
    #   'Unknown attribute' for any other name
    def [](name)
      case name
      when :width
        left
      when :height
        top
      else
        'Unknown attribute'
      end
    end
  end

  Dimensions = CursorPoint
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OnlyofficePdfParser
  # Class for working with files
  class FileHelper
    class << self
      # Reserve a unique temporary file name ending in `.bmp`.
      # The file itself is removed again: only the name is returned, so the
      # caller is responsible for creating (and deleting) whatever it
      # writes under that name.
      # @return [String] name for temp file
      def generate_temp_name
        file = Tempfile.new(%w[onlyoffice_bmp_parser .bmp])
        # Read the path before closing: it is no longer available afterwards
        path = file.path
        # close! both closes the descriptor and unlinks the file; a bare
        # unlink would keep the handle open until garbage collection
        file.close!
        path
      end

      # @param string [Object] value to check, normally a String
      # @return [True, False] if string is file path
      def file_path?(string)
        File.exist?(string)
      rescue ArgumentError, TypeError
        # ArgumentError: binary data containing a null byte;
        # TypeError: value that cannot be converted to a path at all
        false
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Helper methods to convert pdf to bmp.
# Expects the including class to provide @file_path and @pages_in_bmp.
module PdfConvertToBmpHelper
  # Convert the pdf at @file_path to bmp images with ImageMagick's
  # `convert` and append the binary content of every page, in page order,
  # to @pages_in_bmp. The intermediate bmp files are removed afterwards.
  # @return [Array<String>] paths of the (already deleted) converted files
  def fetch_bmp_binary
    temp_path = OnlyofficePdfParser::FileHelper.generate_temp_name
    # Argument-vector form: no shell is involved, so quotes, spaces or `$()`
    # in the file path cannot break or inject into the command.
    IO.popen(['convert', @file_path.to_s, temp_path], &:read)
    files = multipage_files(temp_path)
    begin
      files.each do |file|
        @pages_in_bmp << File.binread(file)
      end
    ensure
      # The converted pages are only needed in memory; do not leak them
      File.delete(*files)
    end
  end

  private

  # Find the files produced for a conversion target. A multi-page document
  # is presumably written as `<base>-<page index>.bmp` by ImageMagick, so
  # the files are ordered by that numeric suffix rather than
  # lexicographically (which would put page 10 before page 2).
  # @param path_pattern [String] pattern to find bmps
  # @return [Array<String>] list of bmps in page order
  def multipage_files(path_pattern)
    files_dir = File.dirname(path_pattern)
    files_base = File.basename(path_pattern, '.*')
    Dir["#{files_dir}/#{files_base}*"].sort_by do |file|
      suffix = File.basename(file, '.*')[files_base.length..-1].to_s
      # No suffix (single-page output) counts as page 0
      suffix[/\d+/].to_i
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OnlyofficePdfParser
  # Module for helper methods for `pdf-reader` gem
  module PdfReaderHelper
    # Return the name of the font registered as :F1 on the page.
    # A subset tag (everything up to the first "+", e.g. "ABCDEF+") is
    # stripped, and any further "+" characters are removed from the name.
    # A name without a subset tag is returned unchanged.
    # @param page [PDF::Reader::Page] page to parse
    # @return [String, Symbol] font name, or :unknown if the page has no
    #   :F1 font
    def parse_font(page)
      font = page.fonts[:F1]
      return :unknown if font.nil?

      base_font = font[:BaseFont].to_s
      prefix, separator, name = base_font.partition('+')
      # No "+" at all: the font is not a tagged subset, keep the whole name
      # instead of returning an empty string
      return prefix if separator.empty?

      name.delete('+')
    end
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pdf/reader'
|
4
|
+
require 'tempfile'
|
5
|
+
require_relative 'helpers/file_helper'
|
6
|
+
require_relative 'pdf_structure/pdf_reader_helper'
|
7
|
+
require_relative 'pdf_structure/pdf_convert_to_bmp_helper'
|
8
|
+
|
9
|
+
module OnlyofficePdfParser
  # Class for working and parsing PDF files
  class PdfStructure
    include PdfConvertToBmpHelper
    include PdfReaderHelper

    # Known page sizes as [width, height] in points (portrait orientation)
    PAGE_SIZE_FOR_PDF = { 'US Letter' => [612, 792],
                          'US Legal' => [612, 1008],
                          'A4' => [595, 842],
                          'A5' => [420, 595],
                          'B5' => [499, 709],
                          'Envelope #10' => [297, 684],
                          'Envelope DL' => [312, 624],
                          'Tabloid' => [792, 1224],
                          'A3' => [842, 1191],
                          'Tabloid Oversize' => [864, 1296],
                          'ROC 16K' => [558, 774],
                          'Envelope Choukei 3' => [340, 666],
                          'Super B/A3' => [936, 1368] }.freeze

    # @return [Array<Hash>] array of pages, each with :text and :fonts
    attr_accessor :pages
    # @return [String] full path to file
    attr_accessor :file_path
    # @return [Array<String>] bin representation of bmps
    attr_reader :pages_in_bmp

    # @param pages [Array<Hash>] already parsed pages
    # @param file_path [String, nil] path to the pdf file
    def initialize(pages: [], file_path: nil)
      @file_path = file_path
      @pages = pages
      @pages_in_bmp = []
    end

    # Hash-like access to parsed data.
    # @param parameter [Symbol] :pages or :page_size
    # @return [Object] value of the matching instance variable
    # @raise [RuntimeError] for any other parameter
    def [](parameter)
      case parameter
      when :pages
        @pages
      when :page_size
        @page_size
      else
        raise "Unknown instance variable - #{parameter}."
      end
    end

    # @param path_to_patter [String] path to the image to look for
    # @return [True, false] Check if pdf file contains graphic pattern
    def contain_pattern?(path_to_patter)
      pages_in_bmp.any? do |current_page|
        !BmpImage.new(current_page).get_sub_image_array(path_to_patter).empty?
      end
    end

    # Parse file using `pdf-reader` gem and append one entry per page
    # to @pages
    def pdf_reader_parse
      PDF::Reader.open(file_path.to_s) do |reader|
        reader.pages.each do |page|
          @pages << { text: page.text,
                      fonts: parse_font(page) }
        end
      end
    end

    # Read the page size from the output of the `pdfinfo` tool.
    # The result is cached after the first call.
    # @return [Array <Integer>] page size of pdf in points
    # @raise [RuntimeError] if `pdfinfo` reports no page size
    def page_size_points
      return @page_size_points if @page_size_points

      # Argument-vector form: the path never passes through a shell, so
      # quotes or `$()` in the file name cannot inject commands.
      pdfinfo = IO.popen(['pdfinfo', @file_path.to_s], &:read)
      # scrub: document metadata in the output may not be valid in the
      # current encoding, which would make the regexp match raise
      match = /Page size:\s*([\d.]+) x ([\d.]+) pts/.match(pdfinfo.scrub)
      raise "Cannot detect page size of '#{@file_path}' from pdfinfo output" unless match

      @page_size_points = match.captures.map { |size| size.to_f.round }
    end

    # @return [String, nil] name of page size, prefixed with "Landscape "
    #   when the page matches a known size rotated; nil for unknown sizes
    def page_size
      points = page_size_points
      @page_size = PAGE_SIZE_FOR_PDF.key(points)
      return @page_size if @page_size

      landscape_name = PAGE_SIZE_FOR_PDF.key(points.reverse)
      # Only build the landscape name when the rotated size is known;
      # otherwise the result would be the meaningless string "Landscape "
      @page_size = landscape_name && "Landscape #{landscape_name}"
    end

    # Parse text, fonts, page bitmaps and page size of a pdf file.
    # @param filename [String] path to the pdf file
    # @return [PdfStructure] parsed file
    def self.parse(filename)
      file = PdfStructure.new(pages: [], file_path: filename)
      file.pdf_reader_parse
      file.fetch_bmp_binary
      file.page_size
      file
    end
  end

  PdfParser = PdfStructure
end
|
metadata
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: onlyoffice_pdf_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Pavel Lobashov
|
8
|
+
- Dmitry Rotaty
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2020-03-13 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: image_size
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '2'
|
21
|
+
type: :runtime
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '2'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: pdf-reader
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '2'
|
35
|
+
type: :runtime
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '2'
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: rmagick
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '2'
|
49
|
+
- - "<"
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '5'
|
52
|
+
type: :runtime
|
53
|
+
prerelease: false
|
54
|
+
version_requirements: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: '2'
|
59
|
+
- - "<"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '5'
|
62
|
+
description: Wrapper for Testrail by OnlyOffice
|
63
|
+
email:
|
64
|
+
- shockwavenn@gmail.com
|
65
|
+
- kvazilife@gmail.com
|
66
|
+
executables: []
|
67
|
+
extensions: []
|
68
|
+
extra_rdoc_files: []
|
69
|
+
files:
|
70
|
+
- README.md
|
71
|
+
- lib/onlyoffice_pdf_parser.rb
|
72
|
+
- lib/onlyoffice_pdf_parser/bmp_image.rb
|
73
|
+
- lib/onlyoffice_pdf_parser/helpers/array_helper.rb
|
74
|
+
- lib/onlyoffice_pdf_parser/helpers/cursor_point.rb
|
75
|
+
- lib/onlyoffice_pdf_parser/helpers/file_helper.rb
|
76
|
+
- lib/onlyoffice_pdf_parser/pdf_structure.rb
|
77
|
+
- lib/onlyoffice_pdf_parser/pdf_structure/pdf_convert_to_bmp_helper.rb
|
78
|
+
- lib/onlyoffice_pdf_parser/pdf_structure/pdf_reader_helper.rb
|
79
|
+
- lib/onlyoffice_pdf_parser/version.rb
|
80
|
+
homepage: https://github.com/onlyoffice-testing-robot/onlyoffice_pdf_parser
|
81
|
+
licenses:
|
82
|
+
- AGPL-3.0
|
83
|
+
metadata: {}
|
84
|
+
post_install_message:
|
85
|
+
rdoc_options: []
|
86
|
+
require_paths:
|
87
|
+
- lib
|
88
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
requirements: []
|
99
|
+
rubygems_version: 3.0.6
|
100
|
+
signing_key:
|
101
|
+
specification_version: 4
|
102
|
+
summary: ONLYOFFICE Testrail Wrapper Gem
|
103
|
+
test_files: []
|