pdf-extract 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/assign.rb ADDED
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "json"
4
+ require "highline"
5
+ require_relative "../lib/pdf-extract"
6
+
7
# Interactive helper for hand-labelling sections: it builds one record per
# section from a fixed list of feature keys and asks the user to pick a
# category through a HighLine menu.
class Assign

  # features   - keys that data_entry copies from a section into its record.
  # categories - the choices offered to the user by with_category.
  def initialize(features, categories)
    # The features of the data we wish to learn from.
    @features = features

    # The possible categorizations of data items.
    @categories = categories

    @hl = HighLine.new
  end

  # Builds the record for one section, prints it, and returns it.
  #
  # category - the label to store under :category.
  # section  - object indexable by each feature key and by :word_count.
  #
  # The record also carries :file, the base name of the path in ARGV[0].
  #
  # Returns the record Hash.
  def data_entry(category, section)
    entry = {}
    @features.each { |feature| entry[feature] = section[feature] }
    entry[:file] = File.basename(ARGV[0])
    entry[:category] = category
    entry[:word_count] = section[:word_count]

    puts entry

    # puts returns nil, so the record has to be the explicit last expression;
    # otherwise callers that collect the result would only ever store nil.
    entry
  end

  # Presents a "Category?" menu with one choice per configured category.
  # The action registered for each choice yields that category to the block
  # given to this method.
  def with_category
    @hl.choose do |menu|
      menu.prompt = "Category?"
      @categories.each do |category|
        menu.choice(category) { yield category }
      end
    end
  end

end
41
+
42
# Walk through every section of the PDF named by ARGV[0] and record a
# category for each: short sections are labelled :none automatically, the
# rest are printed and the user is asked to choose (ref or non-ref).
feature_keys = [:letter_ratio, :name_ratio, :year_ratio, :cap_ratio]
category_choices = [:reference, :body, :mix, :none]
data = []
parsed = PdfExtract.parse(ARGV[0]) { |extractor| extractor.sections }
assign = Assign.new(feature_keys, category_choices)

parsed[:sections].each do |section|
  # Low word count sections are definitely not ref sections.
  # Don't show them to the user.
  if section[:word_count] < 5
    data << assign.data_entry(:none, section)
    next
  end

  # Separator, then the section text, then a blank line before the menu.
  puts "", "-----", ""
  puts PdfExtract::Spatial.get_text_content(section)
  puts ""

  assign.with_category do |category|
    data << assign.data_entry(category, section)
  end
end

puts data

# Disabled: would write the collected data as JSON to the path in ARGV[1].
# NOTE(review): File.open without a mode opens read-only, so this needs a
# "w" mode (and a block to close the file) before it is re-enabled.
# File.open(ARGV[1]).write(data.to_json)
68
+
69
+
70
+
71
+
72
+
data/bin/config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "body_ratio": 15,
3
+ "line_slop": -5
4
+ }
Binary file
Binary file