toc_extract 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +5 -0
- data/lib/toc_extract/extractor.rb +144 -0
- data/lib/toc_extract.rb +8 -0
- metadata +145 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 8166372166f4ca2fc3242398b6c9ff620bdc0760260a882e6373fec870361a84
|
4
|
+
data.tar.gz: 8a10993154383702ed1629e8f20026a817f3115ed076c06cbe4e8ccce561fcbd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: dc2c0915f1c84cda13741dcc674c4878c1731e275c012293dcbc55dc366f50f27a80cda053c6ee00c8522017355651c625b6b81d477c90a84b54899aff64cad5
|
7
|
+
data.tar.gz: 647c126a09844ed72603180b6f2d9569bf83537240d5bdea89dbfdca06a2478dd39800b6a981502ddfaced54f6e662bfb0e6d434a4c97d7da1124e85e7a85655
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2024 TOCExtract Contributors
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,5 @@
|
|
1
|
+
# TocExtract: Extracts the Table of Contents from a PDF
|
2
|
+
|
3
|
+
This is implemented for the specific case of the pdf in https://github.com/coinbase/cb-mpc/tree/master/docs.
|
4
|
+
To run the tests, download https://github.com/coinbase/cb-mpc/blob/master/docs/spec/zk-proofs-spec.pdf (as of commit 3e54547) and place it in the root directory of this repository.
|
5
|
+
|
@@ -0,0 +1,144 @@
|
|
1
|
+
# One table-of-contents entry.
#
# Attributes:
#   id           - the leading section number text taken from the TOC line (e.g. "2.1")
#   title        - the title text taken from the TOC line
#   page_number  - the page number parsed from the end of the TOC line
#   bounding_box - location data for the section heading; nil until it is assigned
class Section
  attr_accessor :id, :title, :page_number, :bounding_box

  # bounding_box is optional so existing three-argument callers keep working;
  # it is initialised explicitly so every Section has the same set of
  # instance variables from construction onwards.
  def initialize(id, title, page_number, bounding_box = nil)
    @id = id
    @title = title
    @page_number = page_number
    @bounding_box = bounding_box
  end
end
|
9
|
+
|
10
|
+
class TocExtract
|
11
|
+
require "pdf/reader"
|
12
|
+
require "pdf/reader/find_text"
|
13
|
+
|
14
|
+
|
15
|
+
# Reads the table-of-contents pages of a PDF and returns one string per TOC entry.
#
# Text runs are grouped into physical lines by their y coordinate: we assume
# that all the elements on a line have the same y value. If this assumption
# changes in the future, we can instead list all the deltas between y values
# and compute breaking points where a noticeable jump in delta occurs.
#
# Physical lines are then merged into entries. The layout is currently
# hard-coded as <NUMBER><TEXT>[...]<NUMBER>: each entry starts with a section
# number, followed by the section title and optionally some dots, and ends
# with a page number. Moreover, we assume that pages use Arabic numbering as
# opposed to Roman numerals.
#
# pdf_file       - handed unchanged to PDF::Reader.open
# template       - placeholder for future extensions; currently unused
# toc_start_page - zero-based index of the first TOC page (inclusive)
# toc_end_page   - zero-based index of the last TOC page (inclusive)
#
# Returns an Array of Strings, one per TOC entry.
def self.toc_lines(pdf_file, template, toc_start_page, toc_end_page)
  max_delta = 0 # any change in y starts a new physical line
  raw_lines = []
  current = +""
  last_y = nil # nil (not 0) marks "no run seen yet", so a real y of 0 is handled

  PDF::Reader.open(pdf_file) do |reader|
    reader.pages.each_with_index do |page, page_num|
      next if page_num < toc_start_page
      break if page_num > toc_end_page

      page.extend(PDF::Reader::FindText)
      page.runs(merge: false).each do |run|
        y = run.y
        last_y = y if last_y.nil?
        if (last_y - y).abs > max_delta
          raw_lines << current
          # dup so that appending further runs never mutates the run's own text
          current = run.text.dup
          last_y = y
        else
          current << run.text
        end
      end
    end
  end
  # Flush the line still being accumulated when the runs ran out; without
  # this the final physical line would be silently dropped.
  raw_lines << current unless current.empty?

  # Merge physical lines into entries and remove the lines that contain only
  # a page number (this should also become part of the template).
  raw_lines.each_with_object([]) do |text, entries|
    if text.match?(/\A\d/)
      # A line made up solely of digits is a page number, not an entry.
      entries << text unless text.match?(/\A\d+\z/)
    elsif !entries.empty?
      # A line that does not start with a digit continues the previous entry.
      # We could be unlucky and get a wrapped line whose continuation happens
      # to start with a number from its text portion. Ignoring for now.
      entries.last << text
    end
    # Otherwise nothing has been collected yet: this is the TOC header.
  end
end
|
71
|
+
|
72
|
+
# Converts TOC entry strings into Section objects.
#
# Same as for the line extraction, the template is important but the layout
# is hard-coded as <NUMBER><TEXT>[...]<NUMBER>:
#   * the section id is the leading run of digits and dots,
#   * the page number is the trailing run of digits (0 when there is none),
#   * the title is whatever lies between them, with a trailing run of dots
#     removed. The title is deliberately NOT whitespace-stripped, so it stays
#     exactly as it appears in the extracted text.
#
# lines    - Array of Strings, one per TOC entry
# template - placeholder for future extensions; currently unused
#
# Returns an Array of Section objects, in the same order as lines.
def self.sections_from_toc_lines(lines, template)
  lines.map do |line|
    section_id = line[/\A[\d.]*/]
    page_digits = line[/\d*\z/]

    # Exclusive range: an empty slice when id and page number meet or overlap.
    title_end = line.length - page_digits.length
    title = line[section_id.length...title_end].sub(/\.+$/, '')

    Section.new(section_id, title, page_digits.to_i)
  end
end
|
110
|
+
|
111
|
+
# Looks up each section's title in the text that follows the table of
# contents and stores the location of the matching text run on the section.
#
# All runs on pages after toc_end_page are concatenated into one string, and
# the character offset at which each run starts is mapped to a hash with the
# keys "x", "y", "width", "endx", "endy" and "page" (the zero-based page
# index). A section receives a bounding box only when the first occurrence
# of its title begins exactly at the start of a run; otherwise its
# bounding_box is set to nil.
#
# NOTE(review): the search always takes the first occurrence of the title in
# the whole text, so a title that also appears earlier in body text will
# match there rather than at its heading.
#
# pdf_file     - handed unchanged to PDF::Reader.open
# sections     - collection of objects responding to #title and #bounding_box=
# toc_end_page - zero-based index of the last TOC page; pages up to and
#                including it are skipped
#
# Returns sections, with bounding_box assigned on every element.
def self.fill_bounding_boxes(pdf_file, sections, toc_end_page)
  offset = 0
  bboxes = {}
  content = +""

  PDF::Reader.open(pdf_file) do |reader|
    reader.pages.each_with_index do |page, page_num|
      next if page_num <= toc_end_page

      page.extend(PDF::Reader::FindText)
      page.runs(merge: false).each do |run|
        bboxes[offset] = {
          "x" => run.x,
          "y" => run.y,
          "width" => run.width,
          "endx" => run.endx,
          "endy" => run.endy,
          "page" => page_num
        }
        content << run.text
        offset += run.text.length
      end
    end
  end

  # For each section, search in the content and find its position,
  # look it up in the bounding boxes and store it.
  sections.each do |section|
    title = section.title.to_s
    # An empty title would "match" at offset 0 and wrongly receive the first
    # run's box, so treat it as not found.
    pos = title.empty? ? nil : content.index(title)
    section.bounding_box = pos.nil? ? nil : bboxes[pos]
  end
end
|
144
|
+
end
|
data/lib/toc_extract.rb
ADDED
metadata
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: toc_extract
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Arash Afshar
|
8
|
+
bindir: bin
|
9
|
+
cert_chain: []
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
11
|
+
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: pdf-reader
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - "~>"
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '2.15'
|
19
|
+
type: :runtime
|
20
|
+
prerelease: false
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - "~>"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: '2.15'
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: pdf-reader-find_text
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - "~>"
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '1.0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '1.0'
|
40
|
+
- !ruby/object:Gem::Dependency
|
41
|
+
name: rake
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '13.0'
|
47
|
+
type: :development
|
48
|
+
prerelease: false
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '13.0'
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: rspec
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '3.0'
|
61
|
+
type: :development
|
62
|
+
prerelease: false
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '3.0'
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: rubocop
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '1.21'
|
75
|
+
type: :development
|
76
|
+
prerelease: false
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '1.21'
|
82
|
+
- !ruby/object:Gem::Dependency
|
83
|
+
name: rubocop-rake
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0.6'
|
89
|
+
type: :development
|
90
|
+
prerelease: false
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - "~>"
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '0.6'
|
96
|
+
- !ruby/object:Gem::Dependency
|
97
|
+
name: rubocop-rspec
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - "~>"
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '2.0'
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '2.0'
|
110
|
+
description: A Ruby gem for extracting and parsing table of contents from PDF documents
|
111
|
+
with bounding box information
|
112
|
+
email:
|
113
|
+
- arash.afshar.edu@gmail.com
|
114
|
+
executables: []
|
115
|
+
extensions: []
|
116
|
+
extra_rdoc_files: []
|
117
|
+
files:
|
118
|
+
- LICENSE.txt
|
119
|
+
- README.md
|
120
|
+
- lib/toc_extract.rb
|
121
|
+
- lib/toc_extract/extractor.rb
|
122
|
+
homepage: https://github.com/Arash-Afshar/toc_extract
|
123
|
+
licenses:
|
124
|
+
- MIT
|
125
|
+
metadata:
|
126
|
+
allowed_push_host: https://rubygems.org
|
127
|
+
source_code_uri: https://github.com/Arash-Afshar/toc_extract
|
128
|
+
rdoc_options: []
|
129
|
+
require_paths:
|
130
|
+
- lib
|
131
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: 2.7.0
|
136
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
137
|
+
requirements:
|
138
|
+
- - ">="
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: '0'
|
141
|
+
requirements: []
|
142
|
+
rubygems_version: 3.6.9
|
143
|
+
specification_version: 4
|
144
|
+
summary: Extract table of contents from PDF files
|
145
|
+
test_files: []
|