wordtree 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/Makefile +239 -0
- data/ext/extconf.rb +5 -0
- data/ext/wordtree.cc +125 -0
- data/lib/wordtree/archdown.rb +2 -4
- data/lib/wordtree/book.rb +10 -36
- data/lib/wordtree/book_list.rb +38 -0
- data/lib/wordtree/disk/librarian.rb +8 -33
- data/lib/wordtree/disk/library.rb +5 -1
- data/lib/wordtree/ngrams.rb +12 -0
- data/lib/wordtree/text.rb +37 -0
- data/lib/wordtree/version.rb +2 -2
- data/spec/wordtree/book_spec.rb +26 -44
- data/spec/wordtree/disk/librarian_spec.rb +0 -44
- data/spec/wordtree/text_spec.rb +81 -0
- data/wordtree.gemspec +17 -15
- metadata +68 -93
- data/lib/wordtree/text_utils.rb +0 -107
- data/spec/wordtree/text_utils_spec.rb +0 -89
data/lib/wordtree/text_utils.rb
DELETED
@@ -1,107 +0,0 @@
|
|
1
|
-
require 'strscan'
|
2
|
-
|
3
|
-
module WordTree
  # Stateless helpers for normalising raw text and walking over its n-grams.
  module TextUtils
    # Splits +text+ in two near +split_index+, preferring a word boundary.
    #
    # The split happens at the last space found at or before +split_index+;
    # that space is dropped from both halves. When no such space exists the
    # text is cut exactly at +split_index+.
    #
    # @param text [String] the text to split
    # @param split_index [Integer] the preferred split position
    # @return [Array(String, String)] +[head, remainder]+; the remainder is
    #   empty when +split_index+ is at or beyond the end of +text+
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      # A negative position would make rindex count from the end of the
      # string, so the backwards search only runs for non-negative indexes.
      space_at = split_index >= 0 ? text.rindex(' ', split_index) : nil

      if space_at
        [text[0...space_at], text[(space_at + 1)..-1]]
      else
        [text[0...split_index], text[split_index..-1]]
      end
    end

    # Removes punctuation and non-alphabetical characters from a text, and
    # returns a cleaned-up, lower-cased version wrapped at +wrap+ characters
    # per line.
    #
    # Rules applied while scanning the input character by character:
    # * newlines become spaces, and runs of spaces collapse into one;
    # * "?" is treated as "." (both mark a sentence boundary);
    # * a sentence boundary is emitted as a stand-alone " . " token, and
    #   consecutive boundaries collapse into one;
    # * a dash is dropped together with any spaces/newlines that follow it,
    #   which re-joins words hyphenated across line breaks;
    # * everything else that is not a-z is discarded.
    #
    # @param input [String] the raw text
    # @param wrap [Integer] maximum preferred line width (must be >= 1)
    # @return [String] the cleaned text, each line terminated by "\n"
    # @raise [ArgumentError] if +wrap+ is smaller than 1 (a zero width could
    #   never make progress while wrapping)
    def self.clean_text(input, wrap=120)
      raise ArgumentError, "wrap must be at least 1" unless wrap >= 1

      # Replace characters that cannot be represented in UTF-8, then fold case.
      text = input.encode('UTF-8', :invalid => :replace, :undef => :replace).downcase

      cleaned = String.new
      swallow_spaces = false
      last_was_space = false
      last_was_period = false

      text.each_char do |char|
        char = ' ' if char == "\n"
        # Question marks count as sentence boundaries, just like periods.
        char = '.' if char == '?'

        if char == '-'
          # Drop the dash and start swallowing the whitespace that follows it.
          swallow_spaces = true
        elsif swallow_spaces && char == ' '
          # Whitespace directly after a dash is ignored.
        elsif char == '.' && !last_was_period
          cleaned << ' ' unless last_was_space
          cleaned << char
          last_was_period = true
          # Pretend a space was added so that following spaces are collapsed;
          # the separating space is emitted lazily before the next letter.
          last_was_space = true
        elsif (char >= 'a' && char <= 'z') || (char == ' ' && !last_was_space)
          cleaned << ' ' if last_was_period
          cleaned << char
          last_was_space = (char == ' ')
          last_was_period = false
          swallow_spaces = false
        end
      end

      wrapped = String.new
      remainder = cleaned
      # Always emit at least one line, then keep splitting while the
      # remainder is still wider than the requested width.
      loop do
        line, remainder = split_near(remainder, wrap)
        wrapped << line << "\n"
        break if remainder.size <= wrap
      end
      wrapped << remainder << "\n" unless remainder.empty?

      wrapped
    end

    # Yields every run of +n+ consecutive words in +input+, in order.
    #
    # Words are maximal runs of characters other than space and newline, and
    # must be separated by exactly one space or newline to belong to the same
    # n-gram. The end of the input also terminates a word, so the final word
    # is included even when the text has no trailing whitespace.
    #
    # @param input [String] text to scan
    # @param n [Integer] number of words per n-gram
    # @yieldparam ngram [String] the words joined by single spaces
    # @return [Enumerator, nil] an Enumerator when no block is given
    def self.each_ngram(input, n=1, &block)
      return enum_for(:each_ngram, input, n) unless block_given?

      word_re  = /[^ \n]+(?:[ \n]|\z)/
      ngram_re = /(?:[^ \n]+(?:[ \n]|\z)){#{n}}/
      scanner  = StringScanner.new(input)

      until scanner.eos?
        if (words = scanner.scan(ngram_re))
          yield words.rstrip.tr("\n", " ")
          # Move back to the beginning of the n-word sequence.
          scanner.unscan
        end
        # Advance by one word; if no word starts here (e.g. stray whitespace),
        # recover by skipping a single character so the scan cannot stall.
        scanner.scan(word_re) || scanner.scan(/./m)
      end
    end
  end
end
|
@@ -1,89 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'wordtree/text_utils'
|
3
|
-
require 'timeout'
|
4
|
-
|
5
|
-
# Behavioural examples for WordTree::TextUtils: word-boundary splitting,
# text clean-up with wrapping, and n-gram iteration.
describe WordTree::TextUtils do
  context "#split_near" do
    it "splits on spaces" do
      head, tail = described_class.split_near("it is near", 7)
      expect(head).to eq("it is")
      expect(tail).to eq("near")
    end

    it "removes a space if index lands on one" do
      head, tail = described_class.split_near("it is near", 5)
      expect(head).to eq("it is")
      expect(tail).to eq("near")
    end

    it "keeps the whole line if index is >= length of line" do
      # Exactly at the end of the text, then one past it.
      [10, 11].each do |position|
        head, tail = described_class.split_near("it is near", position)
        expect(head).to eq("it is near")
        expect(tail).to eq("")
      end
    end

    it "splits at the index anyway if no spaces are found" do
      head, tail = described_class.split_near("itisnear", 4)
      expect(head).to eq("itis")
      expect(tail).to eq("near")
    end
  end

  context "#clean_text" do
    it "wraps" do
      raw = "This, [here] is awesome, right"

      expect(described_class.clean_text(raw, 10)).to eq("this here\nis awesome\nright\n")
      expect(described_class.clean_text(raw, 15)).to eq("this here is\nawesome right\n")
      expect(described_class.clean_text(raw, 150)).to eq("this here is awesome right\n")
    end

    it "joins lines ending in -" do
      raw = "What-\never\ndo you\n mean?"

      expect(described_class.clean_text(raw, 10)).to eq("whatever\ndo you\nmean .\n")
    end

    it "does not ignore sentence boundaries" do
      raw = "This is a sentence. And so is this? Keep the dots."

      expect(described_class.clean_text(raw, 150)).to eq("this is a sentence . and so is this . keep the dots .\n")
      expect(described_class.clean_text(raw, 10)).to eq("this is a\nsentence .\nand so is\nthis .\nkeep the\ndots .\n")
    end

    it "compresses sentence boundary punctuation and spaces" do
      raw = "words . . and.. stuff"

      expect(described_class.clean_text(raw, 150)).to eq("words . and . stuff\n")
    end
  end

  context "#each_ngram" do
    it "yields ngrams in succession" do
      raw = "one word\n. two\n"

      expect { |probe| described_class.each_ngram(raw, 1, &probe) }
        .to yield_successive_args("one", "word", ".", "two")
      expect { |probe| described_class.each_ngram(raw, 2, &probe) }
        .to yield_successive_args("one word", "word .", ". two")
      expect { |probe| described_class.each_ngram(raw, 3, &probe) }
        .to yield_successive_args("one word .", "word . two")
    end

    it "doesn't hang on unexpected input" do
      # The doubled space before the final newline must not stall the scanner.
      raw = "one word\n. two  \n"

      Timeout.timeout(3) do
        expect { |probe| described_class.each_ngram(raw, 1, &probe) }
          .to yield_successive_args("one", "word", ".", "two")
      end
    end
  end
end
|