toc_extract 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 8166372166f4ca2fc3242398b6c9ff620bdc0760260a882e6373fec870361a84
4
+ data.tar.gz: 8a10993154383702ed1629e8f20026a817f3115ed076c06cbe4e8ccce561fcbd
5
+ SHA512:
6
+ metadata.gz: dc2c0915f1c84cda13741dcc674c4878c1731e275c012293dcbc55dc366f50f27a80cda053c6ee00c8522017355651c625b6b81d477c90a84b54899aff64cad5
7
+ data.tar.gz: 647c126a09844ed72603180b6f2d9569bf83537240d5bdea89dbfdca06a2478dd39800b6a981502ddfaced54f6e662bfb0e6d434a4c97d7da1124e85e7a85655
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 TOCExtract Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,5 @@
1
+ # TocExtract: Extracts the Table of Contents from a PDF
2
+
3
+ This is implemented specifically for the PDFs in https://github.com/coinbase/cb-mpc/tree/master/docs.
4
+ To run the tests, download https://github.com/coinbase/cb-mpc/blob/master/docs/spec/zk-proofs-spec.pdf at commit 3e54547 into the root directory of this repository.
5
+
@@ -0,0 +1,144 @@
1
# One table-of-contents entry.
#
# +id+, +title+ and +page_number+ are stored exactly as passed to the
# constructor. +bounding_box+ starts out as nil and is assigned later through
# its writer (TocExtract.fill_bounding_boxes does this).
class Section
  attr_accessor :id, :title, :page_number, :bounding_box

  # @param id [Object] section identifier as it appears in the TOC line
  # @param title [Object] section title
  # @param page_number [Object] page the TOC line points to
  def initialize(id, title, page_number)
    @id = id
    @title = title
    @page_number = page_number
    # Set explicitly so that every instance carries the same instance
    # variables, whether or not a bounding box is ever assigned.
    @bounding_box = nil
  end
end
9
+
10
require "pdf/reader"
require "pdf/reader/find_text"

# Extracts table-of-contents (TOC) entries from a PDF and looks up where each
# entry's title appears in the text that follows the TOC.
class TocExtract
  # Reads the text runs of the TOC pages and groups them into logical TOC lines.
  #
  # To detect the lines, we assume that all the elements on a line have the
  # same y value. If this assumption changes in the future, we can instead list
  # all the deltas between y values and compute breaking points where a
  # noticeable jump in delta occurs.
  #
  # +template+ is a placeholder for future extensions and is currently unused.
  # At the moment we always assume that each line starts with a section number,
  # followed by the section title and optionally some dots, and ends with a
  # page number. Moreover, we assume that pages use arabic numbering as opposed
  # to roman numerals.
  #
  # @param pdf_file path (or IO) handed to PDF::Reader.open
  # @param template unused placeholder
  # @param toc_start_page [Integer] first page to read, compared against the
  #   zero-based index produced by each_with_index
  # @param toc_end_page [Integer] last page to read (inclusive, zero-based)
  # @return [Array<String>] one string per TOC entry
  def self.toc_lines(pdf_file, template, toc_start_page, toc_end_page)
    max_delta = 0 # runs whose y differs by more than this start a new line
    lines = []
    current = +""
    last_y = nil # nil (not 0) marks "no run seen yet", so a real y of 0 is handled

    PDF::Reader.open(pdf_file) do |reader|
      reader.pages.each_with_index do |page, page_num|
        next if page_num < toc_start_page
        break if page_num > toc_end_page

        page.extend(PDF::Reader::FindText)
        page.runs(merge: false).each do |run|
          y = run.y
          last_y = y if last_y.nil?
          if (last_y - y).abs > max_delta
            lines << current
            current = run.text.dup # dup: appending below must not mutate the run
            last_y = y
          else
            current << run.text
          end
        end
      end
    end
    # Flush the line still being accumulated when the pages ran out; without
    # this the final visual line would be silently dropped.
    lines << current unless current.empty?

    # Merge lines based on the template. Currently hard-coded as
    # <NUMBER><TEXT>[...]<NUMBER>. Also remove the lines that contain only a
    # page number. This should also be part of the template, assuming arabic
    # numbering.
    real_lines = []
    lines.each do |entry|
      next if entry.empty? # nothing to classify; entry[0] would be nil

      if entry[0].match?(/[\d]/)
        next unless entry.match?(/\D/) # digits only: a bare page number

        real_lines << entry
      elsif !real_lines.empty?
        # Continuation of the previous entry. We could be unlucky and get a
        # line that happens to have a number in its text portion and the line
        # breaks at that exact moment. Ignoring for now.
        real_lines[-1] += entry
      end
      # Otherwise: text before the first numbered entry (the TOC header) is
      # skipped.
    end

    real_lines
  end

  # Splits each TOC line into a section id, a title and a page number.
  #
  # Same as in toc_lines, the template is important but is hard-coded as
  # <NUMBER><TEXT>[...]<NUMBER>; +template+ is currently unused.
  #
  # @param lines [Array<String>] TOC lines, e.g. the result of toc_lines
  # @param template unused placeholder
  # @return [Array<Section>] one Section per line, in the same order
  def self.sections_from_toc_lines(lines, template)
    lines.map do |line|
      # Section id: the leading run of digits and dots (may be empty).
      section_id = line[/\A[\d\.]*/]
      # Page number: the trailing run of digits (may be empty, giving page 0).
      page_digits = line[/\d*\z/]

      title_start = section_id.length
      title_end = line.length - page_digits.length
      # Title: whatever lies between the two, minus the trailing dot leader.
      title = line[title_start...title_end].sub(/\.+$/, '')

      Section.new(section_id, title, page_digits.to_i)
    end
  end

  # Assigns +bounding_box+ on every section by locating its title in the text
  # of the pages that come after the TOC.
  #
  # All run texts after +toc_end_page+ are concatenated into one string, and
  # the box of each run is remembered under the character offset at which that
  # run starts. A section receives the box stored at the offset of the first
  # occurrence of its title. When the title is not found, or its first
  # occurrence does not begin exactly at the start of a run, the section's
  # bounding_box is set to nil.
  #
  # @param pdf_file path (or IO) handed to PDF::Reader.open
  # @param sections [Array<Section>] sections to update in place
  # @param toc_end_page [Integer] zero-based index of the last TOC page; only
  #   later pages are searched
  # @return [Array<Section>] the +sections+ collection that was passed in
  def self.fill_bounding_boxes(pdf_file, sections, toc_end_page)
    offset = 0
    bboxes = {}
    content_parts = []

    PDF::Reader.open(pdf_file) do |reader|
      reader.pages.each_with_index do |page, page_num|
        next if page_num <= toc_end_page

        page.extend(PDF::Reader::FindText)
        page.runs(merge: false).each do |run|
          text = run.text
          content_parts << text
          bboxes[offset] = {
            "x" => run.x,
            "y" => run.y,
            "width" => run.width,
            "endx" => run.endx,
            "endy" => run.endy,
            "page" => page_num
          }
          offset += text.length
        end
      end
    end
    content = content_parts.join

    # For each section, search in the content and find its position, look it
    # up in the bounding boxes and store it (nil when there is no match).
    sections.each do |section|
      section.bounding_box = bboxes[content.index(section.title)]
    end
  end
end
@@ -0,0 +1,8 @@
1
class TocExtract
  # Builds a Translator for +language+ and returns the result of its +hi+.
  #
  # NOTE(review): the gem's file list contains only lib/toc_extract.rb and
  # lib/toc_extract/extractor.rb, and neither defines Translator, so this
  # method cannot succeed unless the host application provides that class.
  # Confirm whether this method is meant to be part of the public API.
  #
  # @param language [String] passed through to Translator.new
  # @return whatever Translator#hi returns
  # @raise [NameError] when no Translator constant is available
  def self.hi(language = "english")
    unless defined?(Translator)
      raise NameError, "TocExtract.hi needs a Translator class, but none is loaded"
    end

    Translator.new(language).hi
  end
end
7
+
8
+ require_relative 'toc_extract/extractor'
metadata ADDED
@@ -0,0 +1,145 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: toc_extract
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Arash Afshar
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: pdf-reader
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '2.15'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '2.15'
26
+ - !ruby/object:Gem::Dependency
27
+ name: pdf-reader-find_text
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '1.0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '1.0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: rake
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '13.0'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '13.0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: rspec
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '3.0'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: rubocop
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '1.21'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '1.21'
82
+ - !ruby/object:Gem::Dependency
83
+ name: rubocop-rake
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '0.6'
89
+ type: :development
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '0.6'
96
+ - !ruby/object:Gem::Dependency
97
+ name: rubocop-rspec
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '2.0'
103
+ type: :development
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '2.0'
110
+ description: A Ruby gem for extracting and parsing table of contents from PDF documents
111
+ with bounding box information
112
+ email:
113
+ - arash.afshar.edu@gmail.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - LICENSE.txt
119
+ - README.md
120
+ - lib/toc_extract.rb
121
+ - lib/toc_extract/extractor.rb
122
+ homepage: https://github.com/Arash-Afshar/toc_extract
123
+ licenses:
124
+ - MIT
125
+ metadata:
126
+ allowed_push_host: https://rubygems.org
127
+ source_code_uri: https://github.com/Arash-Afshar/toc_extract
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: 2.7.0
136
+ required_rubygems_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubygems_version: 3.6.9
143
+ specification_version: 4
144
+ summary: Extract table of contents from PDF files
145
+ test_files: []