pdf-extract 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/assign.rb +72 -0
- data/bin/config.json +4 -0
- data/bin/fac_v19n11_s5.mask.pdf +0 -0
- data/bin/margins.mask.pdf +0 -0
- data/bin/one-column.mask.pdf +24110 -39
- data/bin/pdf-extract +146 -0
- data/bin/s002040050107_Arch_Toxicol_1994_68_8.mask.pdf +0 -0
- data/bin/some3.mask.pdf +0 -0
- data/bin/some5.mask.pdf +0 -0
- data/bin/some6.mask.pdf +0 -0
- data/bin/train.rb +48 -0
- data/bin/two-column.mask.pdf +0 -0
- data/data/familynames.db +0 -0
- data/data/stopwords.txt +1 -0
- data/lib/analysis/columns.rb +75 -0
- data/lib/analysis/margins.rb +84 -0
- data/lib/analysis/sections.rb +156 -0
- data/lib/analysis/titles.rb +53 -0
- data/lib/analysis/zones.rb +128 -0
- data/lib/font_metrics.rb +240 -0
- data/lib/kmeans.rb +114 -0
- data/lib/language.rb +58 -0
- data/lib/model/characters.rb +320 -0
- data/lib/model/chunks.rb +103 -0
- data/lib/model/regions.rb +112 -0
- data/lib/multi_range.rb +69 -0
- data/lib/names.rb +85 -0
- data/lib/pdf-extract.rb +77 -0
- data/lib/pdf.rb +255 -0
- data/lib/references/references.rb +184 -0
- data/lib/references/resolve.rb +113 -0
- data/lib/references/resolved_references.rb +37 -0
- data/lib/spatial.rb +188 -0
- data/lib/view/abstract_view.rb +32 -0
- data/lib/view/pdf_view.rb +43 -0
- data/lib/view/png_view.rb +30 -0
- data/lib/view/xml_view.rb +113 -0
- metadata +208 -0
data/bin/assign.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "json"
|
4
|
+
require "highline"
|
5
|
+
require_relative "../lib/pdf-extract"
|
6
|
+
|
7
|
+
# Interactive helper that labels sections with a category and turns each
# labelled section into a flat data entry (a Hash).
class Assign

  # features   - keys that are copied from a section into its data entry.
  # categories - the choices offered to the user by #with_category.
  def initialize(features, categories)
    # The features of the data we wish to learn from.
    @features = features

    # The possible categorizations of data items.
    @categories = categories

    @hl = HighLine.new
  end

  # Builds the data entry for +section+ labelled with +category+.
  #
  # The entry contains every configured feature read from +section+, plus
  # :file (the last path component of ARGV[0]), :category and :word_count.
  # The entry is echoed to stdout and then returned.
  #
  # Returning the entry matters: callers append the result of this method
  # to their data set, and `puts` as the last expression would make this
  # method return nil.
  #
  # Raises TypeError when ARGV[0] is missing, because the file name is
  # derived from it.
  def data_entry(category, section)
    entry = {}
    @features.each { |feature| entry[feature] = section[feature] }
    entry[:file] = File.basename(ARGV[0])
    entry[:category] = category
    entry[:word_count] = section[:word_count]

    puts entry
    entry
  end

  # Prompts the user with "Category?" and one menu choice per configured
  # category, then yields the chosen category to the given block.
  def with_category
    @hl.choose do |menu|
      menu.prompt = "Category?"
      @categories.each do |category|
        menu.choice(category) do
          yield category
        end
      end
    end
  end

end
|
41
|
+
|
42
|
+
# Walk through every section of the PDF given as ARGV[0]. Sections with
# fewer than five words are recorded as :none without prompting; every
# other section is printed and the user is asked to pick its category.
features = [:letter_ratio, :name_ratio, :year_ratio, :cap_ratio]
categories = [:reference, :body, :mix, :none]
data = []
pdf = PdfExtract.parse(ARGV[0]) { |doc| doc.sections }
assign = Assign.new(features, categories)

pdf[:sections].each do |section|
  if section[:word_count] < 5
    # Low word count sections are definitely not ref sections.
    # Don't show them to the user.
    data << assign.data_entry(:none, section)
    next
  end

  # Show the section text, framed by a separator, before prompting.
  puts "", "-----", ""
  puts PdfExtract::Spatial.get_text_content(section)
  puts ""

  assign.with_category do |category|
    data << assign.data_entry(category, section)
  end
end

puts data

# NOTE(review): disabled output step. If re-enabled, File.open without a
# mode opens the file read-only, so the write would need mode "w".
# File.open(ARGV[1]).write(data.to_json)
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
|
data/bin/config.json
ADDED
Binary file
|
Binary file
|