toc_extract 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/toc_extract/extractor.rb +9 -1
- data/lib/toc_extract/preview.rb +54 -0
- data/lib/toc_extract.rb +1 -0
- metadata +16 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b4ad0c8374da7bd65d180846d2cdc3753de5882414e2bd8c9938cf5c6576f69
|
4
|
+
data.tar.gz: 10576f74ffbe6849cd18de5448dcc4cbfd86c77b058b3aa854bdbfe4f0961597
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ac11b25e4d645093870437716cb1532e439fc0224e90d5312781a730081333cb7cc9e2221aa53c1938b81b647ac603a9a49d08cbceec7a7604a214a8474d7e4
|
7
|
+
data.tar.gz: 9786a77f1068b8489243c067ea5e95b77921c3689bdc28115768748e65515ff13cc38ded2bf6c3247d18a75bf3ad54bd4d62e9296d49cfca91a88cf2256073e0
|
data/lib/toc_extract/extractor.rb
CHANGED
@@ -11,6 +11,14 @@ class TocExtract
|
|
11
11
|
require "pdf/reader"
|
12
12
|
require "pdf/reader/find_text"
|
13
13
|
|
14
|
+
def self.extract(pdf_file, template, toc_start_page, toc_end_page)
|
15
|
+
lines = TocExtract.toc_lines(pdf_file, template, toc_start_page, toc_end_page)
|
16
|
+
sections = TocExtract.sections_from_toc_lines(lines, template)
|
17
|
+
TocExtract.fill_bounding_boxes(pdf_file, sections, toc_end_page)
|
18
|
+
|
19
|
+
sections
|
20
|
+
end
|
21
|
+
|
14
22
|
|
15
23
|
def self.toc_lines(pdf_file, template, toc_start_page, toc_end_page)
|
16
24
|
# To detect the lines, we assume that all the elements on a line have the same y value.
|
@@ -98,7 +106,7 @@ class TocExtract
|
|
98
106
|
break
|
99
107
|
end
|
100
108
|
end
|
101
|
-
page = page.reverse.to_i
|
109
|
+
page = page.reverse.to_i - 1 # Since TOC pages are 1-based
|
102
110
|
|
103
111
|
# title
|
104
112
|
title = line[title_start..title_end-1].sub(/\.+$/, '')
|
data/lib/toc_extract/preview.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
class TocExtract
|
2
|
+
def self.preview(pdf_file, section, crop_width, crop_height)
|
3
|
+
require "rmagick"
|
4
|
+
require "pdf/reader"
|
5
|
+
require "pdf/reader/find_text"
|
6
|
+
|
7
|
+
result = ""
|
8
|
+
|
9
|
+
PDF::Reader.open(pdf_file) do |reader|
|
10
|
+
page_num = section.page_number
|
11
|
+
|
12
|
+
target_page = reader.pages[page_num]
|
13
|
+
page_width = target_page.width
|
14
|
+
page_height = target_page.height
|
15
|
+
|
16
|
+
images = Magick::Image.read("#{pdf_file}[#{page_num}]") do |info|
|
17
|
+
info.density = 150
|
18
|
+
end
|
19
|
+
|
20
|
+
img = images.first
|
21
|
+
img_width = img.columns
|
22
|
+
img_height = img.rows
|
23
|
+
|
24
|
+
scale_x = img_width.to_f / page_width
|
25
|
+
scale_y = img_height.to_f / page_height
|
26
|
+
|
27
|
+
# In pdf, x,y are the bottom left coordinates, converting them to top left
|
28
|
+
# Convert pdf-reader coordinates to RMagick coordinates
|
29
|
+
# pdf-reader: origin at bottom-left, Y increases upward
|
30
|
+
# RMagick: origin at top-left, Y increases downward
|
31
|
+
# Add some padding as well
|
32
|
+
pdf_x = section.bounding_box["x"] - 30
|
33
|
+
pdf_y = section.bounding_box["endy"] + 10
|
34
|
+
|
35
|
+
img_x = (pdf_x * scale_x).round
|
36
|
+
img_y = ((page_height - pdf_y) * scale_y).round # Flip Y coordinate
|
37
|
+
img_width_pixels = (crop_width * scale_x).round
|
38
|
+
img_height_pixels = (crop_height * scale_y).round
|
39
|
+
|
40
|
+
# Ensure coordinates are within image bounds
|
41
|
+
img_x = [ img_x, 0 ].max
|
42
|
+
img_y = [ img_y, 0 ].max
|
43
|
+
img_width_pixels = [ img_width_pixels, img_width ].min
|
44
|
+
img_height_pixels = [ img_height_pixels, img_height ].min
|
45
|
+
|
46
|
+
cropped_img = img.crop(img_x, img_y, img_width_pixels, img_height_pixels)
|
47
|
+
|
48
|
+
cropped_img.resize_to_fit!(crop_width, crop_height)
|
49
|
+
result = cropped_img.to_blob { |info| info.format = "PNG" }
|
50
|
+
end
|
51
|
+
|
52
|
+
result
|
53
|
+
end
|
54
|
+
end
|
data/lib/toc_extract.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: toc_extract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arash Afshar
|
@@ -37,6 +37,20 @@ dependencies:
|
|
37
37
|
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
39
|
version: '1.0'
|
40
|
+
- !ruby/object:Gem::Dependency
|
41
|
+
name: rmagick
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '6.1'
|
47
|
+
type: :runtime
|
48
|
+
prerelease: false
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '6.1'
|
40
54
|
- !ruby/object:Gem::Dependency
|
41
55
|
name: rake
|
42
56
|
requirement: !ruby/object:Gem::Requirement
|
@@ -119,6 +133,7 @@ files:
|
|
119
133
|
- README.md
|
120
134
|
- lib/toc_extract.rb
|
121
135
|
- lib/toc_extract/extractor.rb
|
136
|
+
- lib/toc_extract/preview.rb
|
122
137
|
homepage: https://github.com/Arash-Afshar/toc_extract
|
123
138
|
licenses:
|
124
139
|
- MIT
|