toc_extract 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8166372166f4ca2fc3242398b6c9ff620bdc0760260a882e6373fec870361a84
4
- data.tar.gz: 8a10993154383702ed1629e8f20026a817f3115ed076c06cbe4e8ccce561fcbd
3
+ metadata.gz: 6b4ad0c8374da7bd65d180846d2cdc3753de5882414e2bd8c9938cf5c6576f69
4
+ data.tar.gz: 10576f74ffbe6849cd18de5448dcc4cbfd86c77b058b3aa854bdbfe4f0961597
5
5
  SHA512:
6
- metadata.gz: dc2c0915f1c84cda13741dcc674c4878c1731e275c012293dcbc55dc366f50f27a80cda053c6ee00c8522017355651c625b6b81d477c90a84b54899aff64cad5
7
- data.tar.gz: 647c126a09844ed72603180b6f2d9569bf83537240d5bdea89dbfdca06a2478dd39800b6a981502ddfaced54f6e662bfb0e6d434a4c97d7da1124e85e7a85655
6
+ metadata.gz: 6ac11b25e4d645093870437716cb1532e439fc0224e90d5312781a730081333cb7cc9e2221aa53c1938b81b647ac603a9a49d08cbceec7a7604a214a8474d7e4
7
+ data.tar.gz: 9786a77f1068b8489243c067ea5e95b77921c3689bdc28115768748e65515ff13cc38ded2bf6c3247d18a75bf3ad54bd4d62e9296d49cfca91a88cf2256073e0
@@ -11,6 +11,14 @@ class TocExtract
11
11
  require "pdf/reader"
12
12
  require "pdf/reader/find_text"
13
13
 
14
# Convenience entry point that chains the three extraction steps for one PDF.
#
# The TOC lines are read via TocExtract.toc_lines, turned into sections via
# TocExtract.sections_from_toc_lines, and the sections are then handed to
# TocExtract.fill_bounding_boxes together with the last TOC page.
#
# NOTE(review): fill_bounding_boxes presumably mutates the sections in place
# (its return value is ignored here) - confirm against its definition.
#
# @param pdf_file forwarded unchanged to toc_lines and fill_bounding_boxes
# @param template forwarded unchanged to toc_lines and sections_from_toc_lines
# @param toc_start_page forwarded unchanged to toc_lines
# @param toc_end_page forwarded unchanged to toc_lines and fill_bounding_boxes
# @return the object returned by sections_from_toc_lines
def self.extract(pdf_file, template, toc_start_page, toc_end_page)
  toc_entries = TocExtract.toc_lines(pdf_file, template, toc_start_page, toc_end_page)

  TocExtract.sections_from_toc_lines(toc_entries, template).tap do |found_sections|
    TocExtract.fill_bounding_boxes(pdf_file, found_sections, toc_end_page)
  end
end
21
+
14
22
 
15
23
  def self.toc_lines(pdf_file, template, toc_start_page, toc_end_page)
16
24
  # To detect the lines, we assume that all the elements on a line have the same y value.
@@ -98,7 +106,7 @@ class TocExtract
98
106
  break
99
107
  end
100
108
  end
101
- page = page.reverse.to_i
109
+ page = page.reverse.to_i - 1 # Since TOC pages are 1-based
102
110
 
103
111
  # title
104
112
  title = line[title_start..title_end-1].sub(/\.+$/, '')
@@ -0,0 +1,54 @@
1
class TocExtract
  # Renders a PNG preview of the area of a PDF page where a section starts.
  #
  # The page is rasterized with RMagick at 150 DPI, the section's bounding box
  # is converted from PDF coordinates to pixel coordinates, and a region
  # starting slightly left of / above the box is cropped and scaled down.
  #
  # @param pdf_file [String] path of the PDF; opened with PDF::Reader and also
  #   interpolated into the ImageMagick read spec ("file[page]")
  # @param section [#page_number, #bounding_box] page_number is used as a
  #   zero-based index into reader.pages and as the ImageMagick frame selector;
  #   bounding_box must answer ["x"] and ["endy"] with PDF coordinates
  # @param crop_width [Numeric] width of the region to crop, in PDF units; also
  #   the maximum pixel width of the returned image
  # @param crop_height [Numeric] height of the region to crop, in PDF units;
  #   also the maximum pixel height of the returned image
  # @return [String] the preview encoded as a PNG blob
  # @raise [ArgumentError] if the page number is not a non-negative Integer,
  #   the section has no bounding box, or the page does not exist in the PDF
  def self.preview(pdf_file, section, crop_width, crop_height)
    # Validate before loading anything heavy. A negative index would otherwise
    # silently select a page counted from the end of reader.pages.
    page_num = section.page_number
    unless page_num.is_a?(Integer) && page_num >= 0
      raise ArgumentError, "section page number must be a non-negative Integer, got #{page_num.inspect}"
    end

    box = section.bounding_box
    raise ArgumentError, "section has no bounding box" if box.nil?

    # Required here rather than at file level, so the libraries are only
    # loaded once a preview is actually requested.
    require "rmagick"
    require "pdf/reader"
    require "pdf/reader/find_text"

    result = ""

    PDF::Reader.open(pdf_file) do |reader|
      target_page = reader.pages[page_num]
      if target_page.nil?
        raise ArgumentError, "page index #{page_num} is out of range for #{pdf_file}"
      end

      page_width = target_page.width
      page_height = target_page.height

      images = Magick::Image.read("#{pdf_file}[#{page_num}]") do |info|
        info.density = 150
      end
      cropped_img = nil

      begin
        img = images.first
        img_width = img.columns
        img_height = img.rows

        # Pixels per PDF unit along each axis.
        scale_x = img_width.to_f / page_width
        scale_y = img_height.to_f / page_height

        # pdf-reader: origin at bottom-left, Y increases upward.
        # RMagick: origin at top-left, Y increases downward.
        # Start 30 units left of and 10 units above the box as padding.
        pdf_x = box["x"] - 30
        pdf_y = box["endy"] + 10

        img_x = (pdf_x * scale_x).round
        img_y = ((page_height - pdf_y) * scale_y).round # Flip Y coordinate
        img_width_pixels = (crop_width * scale_x).round
        img_height_pixels = (crop_height * scale_y).round

        # Keep the crop origin inside the image and limit the crop size to
        # what is left to the right of / below that origin, so the rectangle
        # never extends past the rasterized page.
        img_x = img_x.clamp(0, img_width - 1)
        img_y = img_y.clamp(0, img_height - 1)
        img_width_pixels = [ img_width_pixels, img_width - img_x ].min
        img_height_pixels = [ img_height_pixels, img_height - img_y ].min

        cropped_img = img.crop(img_x, img_y, img_width_pixels, img_height_pixels)

        cropped_img.resize_to_fit!(crop_width, crop_height)
        result = cropped_img.to_blob { |info| info.format = "PNG" }
      ensure
        # RMagick images hold native memory outside the Ruby heap; release it
        # explicitly instead of waiting for the garbage collector.
        cropped_img.destroy! if cropped_img && !cropped_img.destroyed?
        images.each { |image| image.destroy! unless image.destroyed? }
      end
    end

    result
  end
end
data/lib/toc_extract.rb CHANGED
@@ -6,3 +6,4 @@ class TocExtract
6
6
  end
7
7
 
8
8
  require_relative 'toc_extract/extractor'
9
+ require_relative 'toc_extract/preview'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: toc_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arash Afshar
@@ -37,6 +37,20 @@ dependencies:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
39
  version: '1.0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: rmagick
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '6.1'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '6.1'
40
54
  - !ruby/object:Gem::Dependency
41
55
  name: rake
42
56
  requirement: !ruby/object:Gem::Requirement
@@ -119,6 +133,7 @@ files:
119
133
  - README.md
120
134
  - lib/toc_extract.rb
121
135
  - lib/toc_extract/extractor.rb
136
+ - lib/toc_extract/preview.rb
122
137
  homepage: https://github.com/Arash-Afshar/toc_extract
123
138
  licenses:
124
139
  - MIT