pdf-extract 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -1,66 +0,0 @@
1
- require_relative "names"
2
-
3
# NOTE(review): written in nested form so this file also loads when PdfExtract
# has not been defined yet. This assumes PdfExtract is a module (not a class)
# -- confirm against the rest of the gem.
module PdfExtract
  # Stateless text helpers: glyph normalisation plus a handful of ratios
  # computed over a string.
  module Language
    # Single-character replacements applied by .transliterate. Keys are the
    # characters to replace; values are their plain substitutes.
    TRANSLITERATIONS = {
      "\ufb01" => "fi",
      "\ufb02" => "fl",
      "\ufb03" => "ffi",
      "\ufb04" => "ffl",
      "\ufb06" => "st",
      "\u2018" => "'",
      "\u2019" => "'",
      "\u2013" => "-",
      "\u2014" => "-",
      "\u201c" => "\"",
      "\u201d" => "\"",
      "\u25af" => "(",
      "\u00b4" => "",
      "\u00b1" => "-"
    }.freeze

    # Matches any one character that has an entry in TRANSLITERATIONS.
    TRANSLITERATION_PATTERN = Regexp.union(TRANSLITERATIONS.keys).freeze

    # Character set (String#count syntax) counted by .letter_ratio.
    LETTER_RATIO_SET = "A-Z0-9\-[],.\"'()".freeze

    # Exactly four consecutive digits that are not part of a longer digit run.
    YEAR_PATTERN = /(?<!\d)\d{4}(?!\d)/.freeze

    # Replaces every character listed in TRANSLITERATIONS with its substitute,
    # then collapses each run of whitespace into a single space.
    #
    # @param s [String] text to normalise
    # @return [String] a new string; +s+ itself is not modified
    def self.transliterate(s)
      # One pass with a lookup table instead of one gsub per character.
      s.gsub(TRANSLITERATION_PATTERN, TRANSLITERATIONS).gsub(/\s+/, " ")
    end

    # Fraction of the characters in +s+ that are ASCII uppercase letters,
    # digits, or one of: - [ ] , . " ' ( )
    #
    # NOTE(review): lowercase letters are not counted despite the method
    # name -- confirm that this is intended.
    #
    # @param s [String]
    # @return [Float] ratio in 0.0..1.0; 0.0 for an empty string
    def self.letter_ratio(s)
      # Guard the division: 0 / 0.0 would otherwise yield NaN.
      return 0.0 if s.empty?

      s.count(LETTER_RATIO_SET) / s.length.to_f
    end

    # Number of ASCII capital letters that do not open a sentence, divided by
    # the number of whitespace-separated words in +s+. A capital "opens a
    # sentence" when it is the first non-whitespace character of the string or
    # the first non-whitespace character after a ".".
    #
    # TODO: Ignore caps in middle of words
    #
    # @param s [String]
    # @return [Float] 0.0 when +s+ contains no words
    def self.cap_ratio(s)
      word_total = s.split.length
      # Guard the division: an empty or all-whitespace string has no words.
      return 0.0 if word_total.zero?

      at_sentence_start = true
      mid_sentence_caps = 0

      s.each_char do |c|
        case c
        when "."
          at_sentence_start = true
        when /[A-Z]/
          mid_sentence_caps += 1 unless at_sentence_start
          at_sentence_start = false
        when /[^\s]/
          # Any other visible character means we are past the sentence start.
          at_sentence_start = false
        end
      end

      mid_sentence_caps / word_total.to_f
    end

    # Fraction of whitespace-separated words in +s+ that contain a run of
    # exactly four digits (e.g. "1999", "(1999)", "1999,").
    #
    # The pattern uses lookarounds rather than requiring a non-digit character
    # on both sides, so a four-digit run at the very start or end of a word is
    # recognised as well.
    #
    # @param s [String]
    # @return [Float] ratio in 0.0..1.0; 0.0 when +s+ contains no words
    def self.year_ratio(s)
      words = s.split
      return 0.0 if words.empty?

      words.count { |word| word =~ YEAR_PATTERN } / words.length.to_f
    end

    # Returns the :name_frequency entry of the result of
    # PdfExtract::Names.detect_names for +content+.
    #
    # @param content passed through unchanged to PdfExtract::Names.detect_names
    def self.name_ratio(content)
      PdfExtract::Names.detect_names(content)[:name_frequency]
    end

    # Number of whitespace-separated words in +s+.
    #
    # @param s [String]
    # @return [Integer]
    def self.word_count(s)
      s.split.count
    end
  end
end
66
-