onlyoffice_pdf_parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +12 -0
- data/lib/onlyoffice_pdf_parser/bmp_image.rb +110 -0
- data/lib/onlyoffice_pdf_parser/helpers/array_helper.rb +23 -0
- data/lib/onlyoffice_pdf_parser/helpers/cursor_point.rb +47 -0
- data/lib/onlyoffice_pdf_parser/helpers/file_helper.rb +23 -0
- data/lib/onlyoffice_pdf_parser/pdf_structure/pdf_convert_to_bmp_helper.rb +24 -0
- data/lib/onlyoffice_pdf_parser/pdf_structure/pdf_reader_helper.rb +17 -0
- data/lib/onlyoffice_pdf_parser/pdf_structure.rb +97 -0
- data/lib/onlyoffice_pdf_parser/version.rb +7 -0
- data/lib/onlyoffice_pdf_parser.rb +4 -0
- metadata +103 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 661dfba5483c4463abeac412e29aef2c7813ac31edf78e17ca3593ed879e10aa
|
4
|
+
data.tar.gz: 5bcabaa8c35ecf07a814f555693c64720234bffd1e9ee2e82b831e8c65f72a09
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a91f4477b8d9a32de8459856a6e3e965380f5143be90cea0c67b1b94367ecfc1e4961f34be916dc5ae9bfbdf634d0f331d3a45f0b7c34261cf6d7ad9ddc392eb
|
7
|
+
data.tar.gz: b0960da736c0e12c99f02c7dc8c8dc3c08569b25f81b5995b858fd060322e77ab7c3145c4aeca9670be3ef12e8247d8550b410335c3b407bdc5a5a0b7aaf8f55
|
data/lib/onlyoffice_pdf_parser/bmp_image.rb
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'image_size'
|
4
|
+
require 'rmagick'
|
5
|
+
require_relative 'helpers/array_helper'
|
6
|
+
require_relative 'helpers/cursor_point'
|
7
|
+
|
8
|
+
module OnlyofficePdfParser
|
9
|
+
# class for storing bmp image pixels data
|
10
|
+
class BmpImage
|
11
|
+
include Magick
|
12
|
+
attr_accessor :path_to_image, :pixels, :width, :height
|
13
|
+
# @return [String] binary data of file
|
14
|
+
attr_reader :data
|
15
|
+
|
16
|
+
def initialize(param = nil)
|
17
|
+
return unless param
|
18
|
+
|
19
|
+
init_data(param)
|
20
|
+
image_size = ImageSize.new(data).size
|
21
|
+
|
22
|
+
@width = image_size.first
|
23
|
+
@height = image_size.last
|
24
|
+
fetch_pixels
|
25
|
+
end
|
26
|
+
|
27
|
+
def to_s
|
28
|
+
path_to_image
|
29
|
+
end
|
30
|
+
|
31
|
+
def ==(other)
|
32
|
+
return false unless other.width == width && other.height == height
|
33
|
+
|
34
|
+
pixels.each_with_index do |row, row_index|
|
35
|
+
row.each_with_index do |pixel, pixel_index|
|
36
|
+
other_pixel = other.pixels[row_index][pixel_index]
|
37
|
+
result = (pixel == other_pixel)
|
38
|
+
return false unless result
|
39
|
+
end
|
40
|
+
end
|
41
|
+
true
|
42
|
+
end
|
43
|
+
|
44
|
+
def get_sub_image(start_point = CursorPoint.new(0, 0), width = 0, height = 0)
|
45
|
+
sub_image = BmpImage.new
|
46
|
+
pixels_array = []
|
47
|
+
height.times do |current_height|
|
48
|
+
line_array = []
|
49
|
+
width.times do |current_width|
|
50
|
+
pixel_line = pixels[start_point.top + current_height]
|
51
|
+
# If the requested region runs past the last pixel row of the image, no sub-image can be taken here
|
52
|
+
return nil unless pixel_line
|
53
|
+
|
54
|
+
line_array << pixel_line[start_point.left + current_width]
|
55
|
+
end
|
56
|
+
pixels_array << line_array
|
57
|
+
end
|
58
|
+
sub_image.pixels = pixels_array
|
59
|
+
sub_image.width = width
|
60
|
+
sub_image.height = height
|
61
|
+
sub_image
|
62
|
+
end
|
63
|
+
|
64
|
+
def get_sub_image_array(path_to_sub_image)
|
65
|
+
coordinates_array = []
|
66
|
+
sub_image = BmpImage.new(path_to_sub_image)
|
67
|
+
first_sub_image_line = sub_image.pixels.first
|
68
|
+
pixels.each_with_index do |current_line, image_line_index|
|
69
|
+
included_indexes = ArrayHelper.get_array_inclusion_indexes(current_line, first_sub_image_line)
|
70
|
+
included_indexes.each do |current_included_index|
|
71
|
+
coordinates = image_location_start_find(current_included_index, image_line_index)
|
72
|
+
got_sub_image = get_sub_image(coordinates,
|
73
|
+
sub_image.width,
|
74
|
+
sub_image.height)
|
75
|
+
coordinates_array << coordinates if got_sub_image == sub_image
|
76
|
+
end
|
77
|
+
end
|
78
|
+
coordinates_array
|
79
|
+
end
|
80
|
+
|
81
|
+
private
|
82
|
+
|
83
|
+
# @param param [String] path to a file, or the binary content of a file
|
84
|
+
# @return [Void] init class data
|
85
|
+
def init_data(param)
|
86
|
+
if OnlyofficePdfParser::FileHelper.file_path?(param)
|
87
|
+
@data = File.read(param)
|
88
|
+
@path_to_image = param
|
89
|
+
else
|
90
|
+
@data = param
|
91
|
+
@path_to_image = '[Binary Steam]'
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
# @return [Void] Fill @pixels with data
|
96
|
+
def fetch_pixels
|
97
|
+
tmp_file = Tempfile.new('onlyoffice_pdf_parser')
|
98
|
+
File.open(tmp_file, 'wb') { |file| file.write(data) }
|
99
|
+
@pixels = ImageList.new(tmp_file.path).get_pixels(0, 0, width, height).each_slice(width).to_a
|
100
|
+
tmp_file.unlink
|
101
|
+
end
|
102
|
+
|
103
|
+
# @param x_coordinate [Integer] x of start search
|
104
|
+
# @param y_coordinate [Integer] y of start search
|
105
|
+
# @return [CursorPoint] point to start
|
106
|
+
def image_location_start_find(x_coordinate, y_coordinate)
|
107
|
+
CursorPoint.new(x_coordinate % width, y_coordinate)
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OnlyofficePdfParser
|
4
|
+
# Methods to help working with array
|
5
|
+
class ArrayHelper
|
6
|
+
# Get list of indexes in which one array is included to another
|
7
|
+
# @param array [Array] big array
|
8
|
+
# @param included_array [Array] array to search
|
9
|
+
# @return [Array<Integer>] indexes
|
10
|
+
def self.get_array_inclusion_indexes(array,
|
11
|
+
included_array)
|
12
|
+
index_array = []
|
13
|
+
first_element = included_array.first
|
14
|
+
array.each_with_index do |array_element, array_element_index|
|
15
|
+
if array_element == first_element
|
16
|
+
array_part = array.slice(array_element_index, included_array.length)
|
17
|
+
index_array << array_element_index if array_part == included_array
|
18
|
+
end
|
19
|
+
end
|
20
|
+
index_array
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OnlyofficePdfParser
|
4
|
+
# Class for working with cursor coordinates
|
5
|
+
class CursorPoint
|
6
|
+
attr_accessor :left, :top
|
7
|
+
|
8
|
+
def initialize(left, top)
|
9
|
+
@left = left
|
10
|
+
@top = top
|
11
|
+
end
|
12
|
+
|
13
|
+
alias width left
|
14
|
+
alias height top
|
15
|
+
alias x left
|
16
|
+
alias y top
|
17
|
+
|
18
|
+
def dup
|
19
|
+
CursorPoint.new(@left, @top)
|
20
|
+
end
|
21
|
+
|
22
|
+
def to_s
|
23
|
+
"[#{@left}, #{@top}]"
|
24
|
+
end
|
25
|
+
|
26
|
+
def ==(other)
|
27
|
+
if other.respond_to?(:left) && other.respond_to?(:top)
|
28
|
+
@left == other.left && @top == other.top
|
29
|
+
else
|
30
|
+
false
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def [](name)
|
35
|
+
case name
|
36
|
+
when :width
|
37
|
+
left
|
38
|
+
when :height
|
39
|
+
top
|
40
|
+
else
|
41
|
+
'Unknown attribute'
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
Dimensions = CursorPoint
|
47
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OnlyofficePdfParser
|
4
|
+
# Class for working with files
|
5
|
+
class FileHelper
|
6
|
+
class << self
|
7
|
+
# @return [String] name for temp file
|
8
|
+
def generate_temp_name
|
9
|
+
file = Tempfile.new(%w[onlyoffice_bmp_parser .bmp])
|
10
|
+
path = file.path
|
11
|
+
file.unlink
|
12
|
+
path
|
13
|
+
end
|
14
|
+
|
15
|
+
# @return [True, False] if string is file path
|
16
|
+
def file_path?(string)
|
17
|
+
File.exist?(string)
|
18
|
+
rescue ArgumentError
|
19
|
+
false
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Helper methods to convert pdf to bmp
|
4
|
+
module PdfConvertToBmpHelper
|
5
|
+
# Fill @pages_in_bmp with data
|
6
|
+
def fetch_bmp_binary
|
7
|
+
temp_path = OnlyofficePdfParser::FileHelper.generate_temp_name
|
8
|
+
`convert "#{@file_path}" #{temp_path}`
|
9
|
+
files = multipage_files(temp_path)
|
10
|
+
files.each do |file|
|
11
|
+
@pages_in_bmp << File.binread(file)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
# @param path_pattern [String] pattern to find bmps
|
18
|
+
# @return [Array<String>] list of bmps
|
19
|
+
def multipage_files(path_pattern)
|
20
|
+
files_dir = File.dirname(path_pattern)
|
21
|
+
files_base = File.basename(path_pattern, '.*')
|
22
|
+
Dir["#{files_dir}/#{files_base}*"]
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module OnlyofficePdfParser
|
4
|
+
# Module for helper methods for `pdf-reader` gem
|
5
|
+
module PdfReaderHelper
|
6
|
+
# Return font name from page data
|
7
|
+
# @param page [PDF::Reader::Page] page to parse
|
8
|
+
# @return [String, Symbol] font name, or :unknown if the page has no :F1 font
|
9
|
+
def parse_font(page)
|
10
|
+
return :unknown if page.fonts[:F1].nil?
|
11
|
+
|
12
|
+
font_string = page.fonts[:F1][:BaseFont].to_s
|
13
|
+
font_string = /(?=\+)(.*)/.match(font_string).to_s # remove from "+" to ","
|
14
|
+
font_string.delete('+')
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pdf/reader'
|
4
|
+
require 'tempfile'
|
5
|
+
require_relative 'helpers/file_helper'
|
6
|
+
require_relative 'pdf_structure/pdf_reader_helper'
|
7
|
+
require_relative 'pdf_structure/pdf_convert_to_bmp_helper'
|
8
|
+
|
9
|
+
module OnlyofficePdfParser
|
10
|
+
# Class for working and parsing PDF files
|
11
|
+
class PdfStructure
|
12
|
+
include PdfConvertToBmpHelper
|
13
|
+
include PdfReaderHelper
|
14
|
+
# @return [Array<Hash>] array of pages (pdf_reader_parse adds a Hash with :text and :fonts per page)
|
15
|
+
attr_accessor :pages
|
16
|
+
# @return [String] full path to file
|
17
|
+
attr_accessor :file_path
|
18
|
+
# @return [Array<String>] bin representation of bmps
|
19
|
+
attr_reader :pages_in_bmp
|
20
|
+
|
21
|
+
def initialize(pages: [], file_path: nil)
|
22
|
+
@file_path = file_path
|
23
|
+
@pages = pages
|
24
|
+
@pages_in_bmp = []
|
25
|
+
end
|
26
|
+
|
27
|
+
def [](parameter)
|
28
|
+
case parameter
|
29
|
+
when :pages
|
30
|
+
@pages
|
31
|
+
when :page_size
|
32
|
+
@page_size
|
33
|
+
else
|
34
|
+
raise "Unknown instance variable - #{parameter}."
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# @return [True, false] Check if pdf file contains graphic pattern
|
39
|
+
def contain_pattern?(path_to_patter)
|
40
|
+
pages_in_bmp.each do |current_page|
|
41
|
+
bmp = BmpImage.new(current_page)
|
42
|
+
array = bmp.get_sub_image_array(path_to_patter)
|
43
|
+
return true unless array.empty?
|
44
|
+
end
|
45
|
+
false
|
46
|
+
end
|
47
|
+
|
48
|
+
# Parse file using `pdf-reader` gem
|
49
|
+
def pdf_reader_parse
|
50
|
+
PDF::Reader.open(file_path.to_s) do |reader|
|
51
|
+
reader.pages.each do |page|
|
52
|
+
@pages << { text: page.text,
|
53
|
+
fonts: parse_font(page) }
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# @return [Array<Integer>] page size of pdf in points
|
59
|
+
def page_size_points
|
60
|
+
return @page_size_points if @page_size_points
|
61
|
+
|
62
|
+
pdfinfo = `pdfinfo "#{@file_path}"`
|
63
|
+
page_size_fraction = pdfinfo.split('Page size:')[1].split('pts').first.strip.split(', ').first.split(' x ')
|
64
|
+
@page_size_points = page_size_fraction.map { |size| size.to_f.round }
|
65
|
+
end
|
66
|
+
|
67
|
+
# @return [String] name of page size, prefixed with "Landscape " when no portrait size matches
|
68
|
+
def page_size
|
69
|
+
@page_size = PAGE_SIZE_FOR_PDF.key(page_size_points)
|
70
|
+
@page_size ||= "Landscape #{PAGE_SIZE_FOR_PDF.key(page_size_points.reverse)}"
|
71
|
+
end
|
72
|
+
|
73
|
+
PAGE_SIZE_FOR_PDF = { 'US Letter' => [612, 792],
|
74
|
+
'US Legal' => [612, 1008],
|
75
|
+
'A4' => [595, 842],
|
76
|
+
'A5' => [420, 595],
|
77
|
+
'B5' => [499, 709],
|
78
|
+
'Envelope #10' => [297, 684],
|
79
|
+
'Envelope DL' => [312, 624],
|
80
|
+
'Tabloid' => [792, 1224],
|
81
|
+
'A3' => [842, 1191],
|
82
|
+
'Tabloid Oversize' => [864, 1296],
|
83
|
+
'ROC 16K' => [558, 774],
|
84
|
+
'Envelope Choukei 3' => [340, 666],
|
85
|
+
'Super B/A3' => [936, 1368] }.freeze
|
86
|
+
|
87
|
+
def self.parse(filename)
|
88
|
+
file = PdfStructure.new(pages: [], file_path: filename)
|
89
|
+
file.pdf_reader_parse
|
90
|
+
file.fetch_bmp_binary
|
91
|
+
file.page_size
|
92
|
+
file
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
PdfParser = PdfStructure
|
97
|
+
end
|
metadata
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: onlyoffice_pdf_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Pavel Lobashov
|
8
|
+
- Dmitry Rotaty
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2020-03-13 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: image_size
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '2'
|
21
|
+
type: :runtime
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '2'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: pdf-reader
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '2'
|
35
|
+
type: :runtime
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '2'
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: rmagick
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '2'
|
49
|
+
- - "<"
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '5'
|
52
|
+
type: :runtime
|
53
|
+
prerelease: false
|
54
|
+
version_requirements: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: '2'
|
59
|
+
- - "<"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '5'
|
62
|
+
description: Wrapper for Testrail by OnlyOffice
|
63
|
+
email:
|
64
|
+
- shockwavenn@gmail.com
|
65
|
+
- kvazilife@gmail.com
|
66
|
+
executables: []
|
67
|
+
extensions: []
|
68
|
+
extra_rdoc_files: []
|
69
|
+
files:
|
70
|
+
- README.md
|
71
|
+
- lib/onlyoffice_pdf_parser.rb
|
72
|
+
- lib/onlyoffice_pdf_parser/bmp_image.rb
|
73
|
+
- lib/onlyoffice_pdf_parser/helpers/array_helper.rb
|
74
|
+
- lib/onlyoffice_pdf_parser/helpers/cursor_point.rb
|
75
|
+
- lib/onlyoffice_pdf_parser/helpers/file_helper.rb
|
76
|
+
- lib/onlyoffice_pdf_parser/pdf_structure.rb
|
77
|
+
- lib/onlyoffice_pdf_parser/pdf_structure/pdf_convert_to_bmp_helper.rb
|
78
|
+
- lib/onlyoffice_pdf_parser/pdf_structure/pdf_reader_helper.rb
|
79
|
+
- lib/onlyoffice_pdf_parser/version.rb
|
80
|
+
homepage: https://github.com/onlyoffice-testing-robot/onlyoffice_pdf_parser
|
81
|
+
licenses:
|
82
|
+
- AGPL-3.0
|
83
|
+
metadata: {}
|
84
|
+
post_install_message:
|
85
|
+
rdoc_options: []
|
86
|
+
require_paths:
|
87
|
+
- lib
|
88
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
requirements: []
|
99
|
+
rubygems_version: 3.0.6
|
100
|
+
signing_key:
|
101
|
+
specification_version: 4
|
102
|
+
summary: ONLYOFFICE Testrail Wrapper Gem
|
103
|
+
test_files: []
|