wordtree 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,107 +0,0 @@
1
- require 'strscan'
2
-
3
module WordTree
  # Stateless helpers for normalising raw text and walking it as n-grams.
  module TextUtils
    # Split +text+ in two near +split_index+, preferring a word boundary.
    #
    # The split happens at the last space found at or before +split_index+;
    # that space is dropped. If there is no such space, the text is cut at
    # +split_index+ exactly. If +split_index+ is at or past the end of
    # +text+, the whole text is returned with an empty remainder.
    #
    # Returns a two-element array: [line, remainder].
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      # rindex scans backwards starting at split_index. A negative index
      # would be counted from the end of the string, so skip the search then.
      space_at = split_index >= 0 ? text.rindex(' ', split_index) : nil

      if space_at
        [text[0...space_at], text[(space_at + 1)..-1]]
      else
        [text[0...split_index], text[split_index..-1]]
      end
    end

    # Remove punctuation and non-alphabetical characters from a text, and
    # return a cleaned-up version wrapped at +wrap+ characters per line.
    #
    # Rules applied, character by character, after lower-casing:
    # * newlines count as spaces, and runs of spaces collapse to one;
    # * '?' counts as '.', and each run of sentence-ending punctuation is
    #   emitted as a single "." surrounded by spaces;
    # * a '-' is dropped and swallows any spaces up to the next letter, so
    #   words hyphenated across a line break are joined back together;
    # * every other character that is not a-z is dropped.
    #
    # Every line of the result, including the last, ends in "\n".
    #
    # Raises ArgumentError unless +wrap+ is greater than zero: a zero wrap
    # can never consume any text, so the wrapping loop would not terminate.
    def self.clean_text(input, wrap=120)
      raise ArgumentError, "wrap must be greater than zero (got #{wrap.inspect})" unless wrap > 0

      # Ignore non-UTF-8 characters
      text = input.encode('UTF-8', :invalid => :replace, :undef => :replace).downcase

      cleaned = String.new
      swallow_spaces = false      # set by a dash, cleared by the next letter
      just_added_space = false
      just_added_period = false

      text.each_char do |char|
        # Change newlines to spaces
        char = ' ' if char == "\n"
        # Change question marks to periods (i.e. both count as sentence boundaries)
        char = '.' if char == '?'

        if char == '-'
          swallow_spaces = true
        elsif swallow_spaces && char == ' '
          # ignore: we are joining the two halves of a hyphenated word
        elsif char == '.' && !just_added_period
          cleaned << ' ' unless just_added_space
          cleaned << char
          just_added_period = true
          # Treat the period as a separator so a following space is not doubled.
          just_added_space = true
        elsif char.between?('a', 'z') || (char == ' ' && !just_added_space)
          # Add letters and spaces
          cleaned << ' ' if just_added_period
          cleaned << char
          just_added_space = (char == ' ')
          just_added_period = false
          swallow_spaces = false
        end
      end

      wrapped_output = String.new
      remainder = cleaned
      # Always emit at least one line, then keep splitting while the
      # remainder is still longer than the wrap width.
      loop do
        line, remainder = split_near(remainder, wrap)
        wrapped_output << line << "\n"
        break unless remainder.size > wrap
      end
      wrapped_output << remainder << "\n" unless remainder.empty?

      wrapped_output
    end

    # Yield each run of +n+ consecutive words in +input+, advancing one word
    # at a time. Newlines inside a yielded n-gram are replaced by spaces.
    #
    # A word is a run of characters other than space and newline that is
    # followed by one space or newline; a final word with no such terminator
    # is therefore never yielded. Does nothing when no block is given.
    #
    # Returns nil.
    def self.each_ngram(input, n=1, &block)
      return unless block_given?

      onegram_re = /([^ \n]+[ \n])/
      ngram_re = /([^ \n]+[ \n]){#{n}}/
      scanner = StringScanner.new(input)

      until scanner.eos?
        if (words = scanner.scan(ngram_re))
          yield words.rstrip.tr("\n", " ")
          # Move back to beginning of n-word sequence
          scanner.unscan
        end
        # Move forward one word; if we can't find a word, recover by
        # skipping a single character so the scan always makes progress.
        scanner.scan(onegram_re) || scanner.scan(/./m)
      end
    end
  end
end
@@ -1,89 +0,0 @@
1
- require 'spec_helper'
2
- require 'wordtree/text_utils'
3
- require 'timeout'
4
-
5
# Behavioural examples for the text splitting, cleaning and n-gram helpers.
describe WordTree::TextUtils do
  context "#split_near" do
    it "splits on spaces" do
      head, tail = described_class.split_near("it is near", 7)
      expect(head).to eq("it is")
      expect(tail).to eq("near")
    end

    it "removes a space if index lands on one" do
      head, tail = described_class.split_near("it is near", 5)
      expect(head).to eq("it is")
      expect(tail).to eq("near")
    end

    it "keeps the whole line if index is >= length of line" do
      # Index exactly at the end of the text.
      head, tail = described_class.split_near("it is near", 10)
      expect(head).to eq("it is near")
      expect(tail).to eq("")

      # Index past the end of the text.
      head, tail = described_class.split_near("it is near", 11)
      expect(head).to eq("it is near")
      expect(tail).to eq("")
    end

    it "splits at the index anyway if no spaces are found" do
      head, tail = described_class.split_near("itisnear", 4)
      expect(head).to eq("itis")
      expect(tail).to eq("near")
    end
  end

  context "#clean_text" do
    it "wraps" do
      sample_text = "This, [here] is awesome, right"

      expect(described_class.clean_text(sample_text, 10)).to \
        eq("this here\nis awesome\nright\n")
      expect(described_class.clean_text(sample_text, 15)).to \
        eq("this here is\nawesome right\n")
      expect(described_class.clean_text(sample_text, 150)).to \
        eq("this here is awesome right\n")
    end

    it "joins lines ending in -" do
      sample_text = "What-\never\ndo you\n mean?"

      expect(described_class.clean_text(sample_text, 10)).to \
        eq("whatever\ndo you\nmean .\n")
    end

    it "does not ignore sentence boundaries" do
      sample_text = "This is a sentence. And so is this? Keep the dots."

      expect(described_class.clean_text(sample_text, 150)).to \
        eq("this is a sentence . and so is this . keep the dots .\n")
      expect(described_class.clean_text(sample_text, 10)).to \
        eq("this is a\nsentence .\nand so is\nthis .\nkeep the\ndots .\n")
    end

    it "compresses sentence boundary punctuation and spaces" do
      sample_text = "words . . and.. stuff"

      expect(described_class.clean_text(sample_text, 150)).to \
        eq("words . and . stuff\n")
    end
  end

  context "#each_ngram" do
    it "yields ngrams in succession" do
      sample_text = "one word\n. two\n"

      expect { |probe| described_class.each_ngram(sample_text, 1, &probe) }.to \
        yield_successive_args("one", "word", ".", "two")
      expect { |probe| described_class.each_ngram(sample_text, 2, &probe) }.to \
        yield_successive_args("one word", "word .", ". two")
      expect { |probe| described_class.each_ngram(sample_text, 3, &probe) }.to \
        yield_successive_args("one word .", "word . two")
    end

    it "doesn't hang on unexpected input" do
      # The doubled space leaves a position where no word can be matched.
      sample_text = "one word\n. two  \n"

      Timeout.timeout(3) do
        expect { |probe| described_class.each_ngram(sample_text, 1, &probe) }.to \
          yield_successive_args("one", "word", ".", "two")
      end
    end
  end
end