wordtree 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,107 +0,0 @@
1
- require 'strscan'
2
-
3
module WordTree
  # Stateless helpers for normalizing raw text and walking over its n-grams.
  module TextUtils
    # Split +text+ in two at the last space found at or before +split_index+.
    # The space itself is dropped from both halves.
    #
    # * If +split_index+ is at or beyond the end of +text+, the whole text is
    #   returned as the first half and the second half is empty.
    # * If no space exists at or before +split_index+, the text is cut exactly
    #   at +split_index+ (nothing is dropped).
    #
    # Returns a two-element array: [head, remainder].
    def self.split_near(text, split_index)
      return [text, ""] if split_index >= text.size

      # String#rindex interprets a negative position as an offset from the end
      # of the string, so the backward search only runs for real indexes.
      space_at = split_index >= 0 ? text.rindex(' ', split_index) : nil
      return [text[0...space_at], text[(space_at + 1)..-1]] if space_at

      [text[0...split_index], text[split_index..-1]]
    end

    # Remove punctuation and non-alphabetical characters from a text, and
    # return a cleaned-up version wrapped at +wrap+ characters per line.
    #
    # Normalization rules:
    # * everything is lower-cased; only the letters a-z, single spaces and
    #   sentence boundaries survive
    # * newlines become spaces; runs of spaces collapse to one
    # * "?" is treated like "."; a boundary is emitted as " . " and repeated
    #   boundaries collapse to one
    # * a dash glues the surrounding text together, swallowing any spaces or
    #   newlines that follow it (so hyphenated line breaks are re-joined)
    #
    # Every output line, including the last, ends with "\n"; empty input
    # therefore yields "\n".
    #
    # Raises ArgumentError if +wrap+ is not positive, because no line could
    # ever be emitted and wrapping would never finish.
    def self.clean_text(input, wrap=120)
      raise ArgumentError, "wrap must be positive (got #{wrap.inspect})" unless wrap > 0

      # Replace invalid/unconvertible bytes, then fold case once up front so
      # the scan below only has to recognise lower-case letters.
      text = input.encode('UTF-8', :invalid => :replace, :undef => :replace).downcase

      cleaned = String.new
      joining = false       # a dash was seen; swallow spaces until the next letter
      after_space = false   # the last thing emitted was a space (or a boundary)
      after_period = false  # the last thing emitted was a sentence boundary

      text.each_char do |ch|
        ch = ' ' if ch == "\n"
        # Question marks and periods both count as sentence boundaries.
        ch = '.' if ch == '?'

        if ch == '-'
          joining = true
        elsif joining && ch == ' '
          # Whitespace directly after a dash is dropped so the words join up.
        elsif ch == '.' && !after_period
          cleaned << ' ' unless after_space
          cleaned << '.'
          after_period = true
          after_space = true
        elsif ch.between?('a', 'z') || (ch == ' ' && !after_space)
          # The separating space after a boundary is added lazily, only once
          # more text actually follows it.
          cleaned << ' ' if after_period
          cleaned << ch
          after_space = (ch == ' ')
          after_period = false
          joining = false
        end
      end

      # Peel off one line at a time; the first line is always emitted, even
      # when the cleaned text is empty.
      wrapped = String.new
      loop do
        line, cleaned = split_near(cleaned, wrap)
        wrapped << line << "\n"
        break if cleaned.size <= wrap
      end
      wrapped << cleaned << "\n" unless cleaned.empty?

      wrapped
    end

    # Yield each run of +n+ consecutive words in +input+, advancing one word
    # at a time. Words are separated by a single space or newline; newlines
    # inside a yielded n-gram are converted to spaces. The final word may be
    # terminated by the end of the string instead of a separator.
    #
    # Returns an Enumerator when called without a block.
    def self.each_ngram(input, n=1, &block)
      return enum_for(:each_ngram, input, n) unless block_given?

      word_re = /[^ \n]+(?:[ \n]|\z)/
      ngram_re = /(?:#{word_re.source}){#{n}}/
      scanner = StringScanner.new(input)

      until scanner.eos?
        if (words = scanner.scan(ngram_re))
          yield words.rstrip.tr("\n", " ")
          # Rewind to the start of the n-gram so the window slides by one word.
          scanner.unscan
        end
        # Step over one word; if the scanner is sitting on a stray separator,
        # skip a single character so the loop always makes progress.
        scanner.scan(word_re) || scanner.getch
      end
    end
  end
end
@@ -1,89 +0,0 @@
1
- require 'spec_helper'
2
- require 'wordtree/text_utils'
3
- require 'timeout'
4
-
5
# Behavioural examples for the text helpers: splitting near a column,
# cleaning/wrapping raw text, and sliding-window n-gram iteration.
describe WordTree::TextUtils do
  context "#split_near" do
    # A split index inside a word backs up to the preceding space.
    it "splits on spaces" do
      parts = described_class.split_near("it is near", 7)
      expect(parts.first).to eq("it is")
      expect(parts.last).to eq("near")
    end

    # The separating space belongs to neither half.
    it "removes a space if index lands on one" do
      parts = described_class.split_near("it is near", 5)
      expect(parts.first).to eq("it is")
      expect(parts.last).to eq("near")
    end

    it "keeps the whole line if index is >= length of line" do
      [10, 11].each do |index|
        parts = described_class.split_near("it is near", index)
        expect(parts.first).to eq("it is near")
        expect(parts.last).to eq("")
      end
    end

    it "splits at the index anyway if no spaces are found" do
      parts = described_class.split_near("itisnear", 4)
      expect(parts.first).to eq("itis")
      expect(parts.last).to eq("near")
    end
  end

  context "#clean_text" do
    it "wraps" do
      sample_text = "This, [here] is awesome, right"

      expect(described_class.clean_text(sample_text, 10)).to eq("this here\nis awesome\nright\n")
      expect(described_class.clean_text(sample_text, 15)).to eq("this here is\nawesome right\n")
      expect(described_class.clean_text(sample_text, 150)).to eq("this here is awesome right\n")
    end

    it "joins lines ending in -" do
      sample_text = "What-\never\ndo you\n mean?"

      expect(described_class.clean_text(sample_text, 10)).to eq("whatever\ndo you\nmean .\n")
    end

    it "does not ignore sentence boundaries" do
      sample_text = "This is a sentence. And so is this? Keep the dots."

      expect(described_class.clean_text(sample_text, 150)).to eq("this is a sentence . and so is this . keep the dots .\n")
      expect(described_class.clean_text(sample_text, 10)).to eq("this is a\nsentence .\nand so is\nthis .\nkeep the\ndots .\n")
    end

    it "compresses sentence boundary punctuation and spaces" do
      sample_text = "words . . and.. stuff"

      expect(described_class.clean_text(sample_text, 150)).to eq("words . and . stuff\n")
    end
  end

  context "#each_ngram" do
    it "yields ngrams in succession" do
      sample_text = "one word\n. two\n"

      expect { |probe| described_class.each_ngram(sample_text, 1, &probe) }.to yield_successive_args("one", "word", ".", "two")
      expect { |probe| described_class.each_ngram(sample_text, 2, &probe) }.to yield_successive_args("one word", "word .", ". two")
      expect { |probe| described_class.each_ngram(sample_text, 3, &probe) }.to yield_successive_args("one word .", "word . two")
    end

    # The trailing " \n" leaves a separator with no word in front of it; the
    # timeout turns a stalled scan into a failure.
    it "doesn't hang on unexpected input" do
      sample_text = "one word\n. two \n"

      Timeout.timeout(3) do
        expect { |probe| described_class.each_ngram(sample_text, 1, &probe) }.to yield_successive_args("one", "word", ".", "two")
      end
    end
  end
end