wordtree 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/Makefile +239 -0
- data/ext/extconf.rb +5 -0
- data/ext/wordtree.cc +125 -0
- data/lib/wordtree/archdown.rb +2 -4
- data/lib/wordtree/book.rb +10 -36
- data/lib/wordtree/book_list.rb +38 -0
- data/lib/wordtree/disk/librarian.rb +8 -33
- data/lib/wordtree/disk/library.rb +5 -1
- data/lib/wordtree/ngrams.rb +12 -0
- data/lib/wordtree/text.rb +37 -0
- data/lib/wordtree/version.rb +2 -2
- data/spec/wordtree/book_spec.rb +26 -44
- data/spec/wordtree/disk/librarian_spec.rb +0 -44
- data/spec/wordtree/text_spec.rb +81 -0
- data/wordtree.gemspec +17 -15
- metadata +68 -93
- data/lib/wordtree/text_utils.rb +0 -107
- data/spec/wordtree/text_utils_spec.rb +0 -89
data/lib/wordtree/text_utils.rb
DELETED
@@ -1,107 +0,0 @@
|
|
1
|
-
require 'strscan'
|
2
|
-
|
3
|
-
module WordTree
|
4
|
-
module TextUtils
|
5
|
-
def self.split_near(text, split_index)
|
6
|
-
if split_index >= text.size
|
7
|
-
return [text, ""]
|
8
|
-
else
|
9
|
-
index = split_index
|
10
|
-
while index >= 0
|
11
|
-
if text[index] == ' '
|
12
|
-
return [text[0...index], text[(index+1)..-1]]
|
13
|
-
end
|
14
|
-
index -= 1
|
15
|
-
end
|
16
|
-
return [text[0...split_index], text[split_index..-1]]
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
# Remove punctuation and non-alphabetical characters from a text, and return
|
21
|
-
# a cleaned-up version wrapped at +wrap+ characters per line.
|
22
|
-
def self.clean_text(input, wrap=120)
|
23
|
-
join = nil
|
24
|
-
output = String.new
|
25
|
-
output_line = String.new
|
26
|
-
|
27
|
-
# Ignore non-UTF-8 characters
|
28
|
-
input = input.encode('UTF-8', :invalid => :replace, :undef => :replace).downcase
|
29
|
-
|
30
|
-
_0 = '0'.ord
|
31
|
-
_9 = '9'.ord
|
32
|
-
_a = 'a'.ord
|
33
|
-
_z = 'z'.ord
|
34
|
-
_A = 'A'.ord
|
35
|
-
_Z = 'Z'.ord
|
36
|
-
_dash = '-'.ord
|
37
|
-
_space = ' '.ord
|
38
|
-
_newline = "\n".ord
|
39
|
-
_period = '.'.ord
|
40
|
-
_question = '?'.ord
|
41
|
-
|
42
|
-
join_lines = false
|
43
|
-
just_added_space = false
|
44
|
-
just_added_period = false
|
45
|
-
line_length = 0
|
46
|
-
input.each_char do |c|
|
47
|
-
c = c.ord
|
48
|
-
# Change upper-case to lower-case
|
49
|
-
c -= 32 if (c >= _A && c <= _Z)
|
50
|
-
# Change newlines to spaces
|
51
|
-
c = _space if c == _newline
|
52
|
-
# Change question marks to periods (i.e. both count as sentence boundaries)
|
53
|
-
c = _period if c == _question
|
54
|
-
|
55
|
-
if c == _dash
|
56
|
-
# In case of a dash, set the scoop-spaces-up flag
|
57
|
-
join_lines = true
|
58
|
-
elsif join_lines && (c == _space)
|
59
|
-
# ignore
|
60
|
-
elsif (c == _period) && !just_added_period
|
61
|
-
if !just_added_space
|
62
|
-
output << _space.chr
|
63
|
-
end
|
64
|
-
output << c.chr
|
65
|
-
just_added_period = true
|
66
|
-
just_added_space = true
|
67
|
-
elsif (c >= _a && c <= _z) || (c == _space && !just_added_space)
|
68
|
-
# Add letters and spaces
|
69
|
-
output << _space.chr if just_added_period
|
70
|
-
output << c.chr
|
71
|
-
line_length += 1
|
72
|
-
just_added_space = (c == _space)
|
73
|
-
just_added_period = false
|
74
|
-
join_lines = false
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
wrapped_output = String.new
|
79
|
-
begin
|
80
|
-
output_line, remainder = split_near(output, wrap)
|
81
|
-
wrapped_output << output_line + "\n"
|
82
|
-
output = remainder
|
83
|
-
end while remainder.size > wrap
|
84
|
-
wrapped_output << remainder + "\n" unless remainder.empty?
|
85
|
-
|
86
|
-
return wrapped_output
|
87
|
-
end
|
88
|
-
|
89
|
-
def self.each_ngram(input, n=1, &block)
|
90
|
-
onegram_re = /([^ \n]+[ \n])/
|
91
|
-
ngram_re = /([^ \n]+[ \n]){#{n},#{n}}/
|
92
|
-
s = StringScanner.new(input)
|
93
|
-
while !s.eos?
|
94
|
-
if words = s.scan(ngram_re)
|
95
|
-
yield words.rstrip.tr("\n", " ") if block_given?
|
96
|
-
# Move back to beginning of n-word sequence
|
97
|
-
s.unscan
|
98
|
-
end
|
99
|
-
# Move forward one word
|
100
|
-
if !s.scan(onegram_re)
|
101
|
-
# if we can't find a word, let's try to recover by scanning one char at a time
|
102
|
-
s.scan(/./m)
|
103
|
-
end
|
104
|
-
end
|
105
|
-
end
|
106
|
-
end
|
107
|
-
end
|
data/spec/wordtree/text_utils_spec.rb
DELETED
@@ -1,89 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'wordtree/text_utils'
|
3
|
-
require 'timeout'
|
4
|
-
|
5
|
-
describe WordTree::TextUtils do
|
6
|
-
context "#split_near" do
|
7
|
-
it "splits on spaces" do
|
8
|
-
line, rem = WordTree::TextUtils.split_near("it is near", 7)
|
9
|
-
expect(line).to eq("it is")
|
10
|
-
expect(rem).to eq("near")
|
11
|
-
end
|
12
|
-
|
13
|
-
it "removes a space if index lands on one" do
|
14
|
-
line, rem = WordTree::TextUtils.split_near("it is near", 5)
|
15
|
-
expect(line).to eq("it is")
|
16
|
-
expect(rem).to eq("near")
|
17
|
-
end
|
18
|
-
|
19
|
-
it "keeps the whole line if index is >= length of line" do
|
20
|
-
line, rem = WordTree::TextUtils.split_near("it is near", 10)
|
21
|
-
expect(line).to eq("it is near")
|
22
|
-
expect(rem).to eq("")
|
23
|
-
|
24
|
-
line, rem = WordTree::TextUtils.split_near("it is near", 11)
|
25
|
-
expect(line).to eq("it is near")
|
26
|
-
expect(rem).to eq("")
|
27
|
-
end
|
28
|
-
|
29
|
-
it "splits at the index anyway if no spaces are found" do
|
30
|
-
line, rem = WordTree::TextUtils.split_near("itisnear", 4)
|
31
|
-
expect(line).to eq("itis")
|
32
|
-
expect(rem).to eq("near")
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
context "#clean_text" do
|
37
|
-
it "wraps" do
|
38
|
-
sample_text = "This, [here] is awesome, right"
|
39
|
-
cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
|
40
|
-
expect(cleaned).to eq("this here\nis awesome\nright\n")
|
41
|
-
|
42
|
-
cleaned = WordTree::TextUtils.clean_text(sample_text, 15)
|
43
|
-
expect(cleaned).to eq("this here is\nawesome right\n")
|
44
|
-
|
45
|
-
cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
|
46
|
-
expect(cleaned).to eq("this here is awesome right\n")
|
47
|
-
end
|
48
|
-
|
49
|
-
it "joins lines ending in -" do
|
50
|
-
sample_text = "What-\never\ndo you\n mean?"
|
51
|
-
cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
|
52
|
-
expect(cleaned).to eq("whatever\ndo you\nmean .\n")
|
53
|
-
end
|
54
|
-
|
55
|
-
it "does not ignore sentence boundaries" do
|
56
|
-
sample_text = "This is a sentence. And so is this? Keep the dots."
|
57
|
-
cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
|
58
|
-
expect(cleaned).to eq("this is a sentence . and so is this . keep the dots .\n")
|
59
|
-
cleaned = WordTree::TextUtils.clean_text(sample_text, 10)
|
60
|
-
expect(cleaned).to eq("this is a\nsentence .\nand so is\nthis .\nkeep the\ndots .\n")
|
61
|
-
end
|
62
|
-
|
63
|
-
it "compresses sentence boundary punctuation and spaces" do
|
64
|
-
sample_text = "words . . and.. stuff"
|
65
|
-
cleaned = WordTree::TextUtils.clean_text(sample_text, 150)
|
66
|
-
expect(cleaned).to eq("words . and . stuff\n")
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
context "#each_ngram" do
|
71
|
-
it "yields ngrams in succession" do
|
72
|
-
sample_text = "one word\n. two\n"
|
73
|
-
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 1, &b) }.to \
|
74
|
-
yield_successive_args("one", "word", ".", "two")
|
75
|
-
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 2, &b) }.to \
|
76
|
-
yield_successive_args("one word", "word .", ". two")
|
77
|
-
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 3, &b) }.to \
|
78
|
-
yield_successive_args("one word .", "word . two")
|
79
|
-
end
|
80
|
-
|
81
|
-
it "doesn't hang on unexpected input" do
|
82
|
-
sample_text = "one word\n. two \n"
|
83
|
-
Timeout.timeout(3) do
|
84
|
-
expect{ |b| WordTree::TextUtils.each_ngram(sample_text, 1, &b) }.to \
|
85
|
-
yield_successive_args("one", "word", ".", "two")
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
89
|
-
end
|