incollege-text 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
# Shared bootstrap for the RSpec suite: flags the process as a test run,
# puts the library on the load path, starts coverage and loads the URL fixtures.
$TESTING = true

# Ruby 1.8 encoding check: only interpreters older than 1.9 get $KCODE set.
ruby_major, ruby_minor = RUBY_VERSION.split('.').first(2).map { |part| part.to_i }
$KCODE = 'u' if ruby_major == 1 && ruby_minor < 9

$LOAD_PATH.push(File.join(File.dirname(__FILE__), '..', 'lib'))

require 'nokogiri'
require 'json'
require 'simplecov'
# Coverage is started here, ahead of the require of the library below.
SimpleCov.start { add_group 'Libraries', 'lib' }

require File.expand_path('../../lib/incollege-text', __FILE__)
require File.expand_path('../test_urls', __FILE__)

# Make the TestUrls constants reachable from every example group.
RSpec.configure { |rspec| rspec.include TestUrls }
24
+
25
# Matcher: passes when Incollege::Extractor.extract_urls returns a non-empty
# result for the string under test.
RSpec::Matchers.define :match_autolink_expression do
  match do |string|
    # The rest of this package references the Incollege namespace; the former
    # Twitter:: constant is not loaded by this helper and would raise NameError.
    # NOTE(review): assumes Incollege::Extractor responds to extract_urls at
    # module level -- confirm against lib/incollege-text.
    !Incollege::Extractor.extract_urls(string).empty?
  end
end
30
+
31
# Matcher: passes when the :valid_url expression matches inside +text+ and
# the stripped match equals the URL under test.
#
# text - the surrounding text that is searched for a URL.
RSpec::Matchers.define :match_autolink_expression_in do |text|
  match do |url|
    # NOTE(review): assumes the library exposes Incollege::Regex[:valid_url]
    # (the Twitter:: constant used previously is not loaded here) -- confirm.
    @match_data = Incollege::Regex[:valid_url].match(text)
    @match_data && @match_data.to_s.strip == url
  end

  failure_message_for_should do |url|
    # @match_data is nil when nothing matched, so guard before reading captures.
    outcome = if @match_data
      "the match was #{@match_data.captures}"
    else
      "no match was found"
    end
    "Expected to find url '#{url}' in text '#{text}', but #{outcome}"
  end
end
41
+
42
# Matcher: passes when the HTML under test contains an anchor whose href is
# +url+ and whose text is +inner_text+ (or +url+ itself when no inner_text
# is given).
#
# url        - expected href of the anchor.
# inner_text - optional expected anchor text.
RSpec::Matchers.define :have_autolinked_url do |url, inner_text|
  match do |text|
    @link = Nokogiri::HTML(text).search("a[@href='#{url}']")

    if inner_text
      @link && @link.inner_text && @link.inner_text == inner_text
    else
      @link.inner_text == url
    end
  end

  failure_message_for_should do |text|
    inner_text_clause = inner_text ? ", inner_text '#{inner_text}'" : ""
    "Expected url '#{url}'#{inner_text_clause} to be autolinked in '#{text}'"
  end
end
54
+
55
# Matcher: passes when the HTML under test contains an "a.username" link
# whose text equals the expected name and whose href is
# "https://incollege.com/<screen_name>".
#
# screen_name - screen name the href must point to.
# inner_text  - optional expected link text; defaults to screen_name.
RSpec::Matchers.define :link_to_screen_name do |screen_name, inner_text|
  expected = inner_text || screen_name

  match do |text|
    @link = Nokogiri::HTML(text).search("a.username")
    first_link = @link.first

    # Return a plain boolean. A nested `.should` here would raise on mismatch,
    # which breaks `should_not` and bypasses the failure messages below; the
    # nil check avoids NoMethodError when no link was produced at all.
    !first_link.nil? &&
      @link.inner_text == expected &&
      first_link['href'] == "https://incollege.com/#{screen_name}"
  end

  failure_message_for_should do |text|
    if @link.first
      "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' to match screen_name '#{expected}', but it does not."
    else
      "Expected screen name '#{screen_name}' to be autolinked in '#{text}', but no link was found."
    end
  end

  failure_message_for_should_not do |text|
    "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' not to match screen_name '#{expected}', but it does."
  end

  description do
    "contain a link with the name and href pointing to the expected screen_name"
  end
end
81
+
82
# Matcher: passes when the HTML under test contains an "a.list-slug" link
# whose text equals the expected path and whose href is the downcased
# "https://incollege.com/<list_path>".
#
# list_path  - list path the href must point to.
# inner_text - optional expected link text; defaults to list_path.
RSpec::Matchers.define :link_to_list_path do |list_path, inner_text|
  expected = inner_text || list_path

  match do |text|
    @link = Nokogiri::HTML(text).search("a.list-slug")
    first_link = @link.first

    # Return a plain boolean. A nested `.should` here would raise on mismatch,
    # which breaks `should_not` and bypasses the failure messages below; the
    # nil check avoids NoMethodError when no link was produced at all.
    !first_link.nil? &&
      @link.inner_text == expected &&
      first_link['href'] == "https://incollege.com/#{list_path}".downcase
  end

  failure_message_for_should do |text|
    if @link.first
      "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' to match the list path '#{expected}', but it does not."
    else
      "Expected list path '#{list_path}' to be autolinked in '#{text}', but no link was found."
    end
  end

  failure_message_for_should_not do |text|
    "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' not to match the list path '#{expected}', but it does."
  end

  description do
    "contain a link with the list title and an href pointing to the list path"
  end
end
108
+
109
# Matcher: passes when the HTML under test contains an anchor pointing at the
# hashtag search URL (leading "#" encoded as "%23") whose text is the hashtag.
#
# hashtag - the hashtag, including its leading "#".
RSpec::Matchers.define :have_autolinked_hashtag do |hashtag|
  match do |text|
    search_href = "https://incollege.com/#!/search?q=#{hashtag.sub(/^#/, '%23')}"
    @link = Nokogiri::HTML(text).search("a[@href='#{search_href}']")

    anchor_text = @link && @link.inner_text
    anchor_text && anchor_text == hashtag
  end

  failure_message_for_should do |text|
    if @link.first
      "Expected link text to be [#{hashtag}], but it was [#{@link.inner_text}] in #{text}"
    else
      "Expected hashtag #{hashtag} to be autolinked in '#{text}', but no link was found."
    end
  end

  failure_message_for_should_not do |text|
    "Expected link '#{@link.inner_text}' with href '#{@link.first['href']}' not to match the hashtag '#{hashtag}', but it does."
  end
end
@@ -0,0 +1,84 @@
1
+ # encoding: utf-8
2
+
3
# URL fixtures shared by the specs. Each list is defined only once (guarded
# by defined?) and frozen so that no example can mutate the shared data.
module TestUrls
  # NOTE(review): presumably URLs the extractor/autolinker should accept --
  # confirm against the specs that consume this list.
  VALID = [
    "http://google.com",
    "http://foobar.com/#",
    "http://google.com/#foo",
    "http://google.com/#search?q=iphone%20-filter%3Alinks",
    "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
    "http://somedomain.com/index.php?path=/abc/def/",
    "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
    "http://somehost.com:3000",
    "http://xo.com/~matthew+%-x",
    "http://en.wikipedia.org/wiki/Primer_(film)",
    "http://www.ams.org/bookstore-getitem/item=mbk-59",
    "http://chilp.it/?77e8fd",
    "http://tell.me/why",
    "http://longtlds.info",
    "http://✪df.ws/ejp",
    "http://日本.com",
    "http://search.twitter.com/search?q=avro&lang=en",
    "http://mrs.domain-dash.biz",
    "http://x.com/has/one/char/domain",
    "http://t.co/nwcLTFF",
    "http://sub_domain-dash.twitter.com",
    "http://a.b.cd",
    "http://a_b.c-d.com",
    "http://a-b.b.com",
    "http://twitter-dash.com",
    "http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx",
    "www.foobar.com",
    "WWW.FOOBAR.COM",
    "www.foobar.co.jp",
    "http://t.co",
    "t.co/nwcLTFF",
    "http://foobar.みんな",
    "http://foobar.中国",
    "http://foobar.پاکستان",
    "https://www.youtube.com/playlist?list=PL0ZPu8XSRTB7wZzn0mLHMvyzVFeRxbWn-"
  ].freeze unless defined?(TestUrls::VALID)

  # NOTE(review): presumably URLs that should be rejected -- confirm against
  # the consuming specs. The last five embed the directional formatting code
  # points U+202A..U+202E in the host name.
  INVALID = [
    "http://no-tld",
    "http://tld-too-short.x",
    "http://-doman_dash.com",
    "http://_leadingunderscore.twitter.com",
    "http://trailingunderscore_.twitter.com",
    "http://-leadingdash.twitter.com",
    "http://trailingdash-.twitter.com",
    "http://-leadingdash.com",
    "http://trailingdash-.com",
    "http://no_underscores.com",
    "http://test.c_o_m",
    "http://test.c-o-m",
    "http://twitt#{[0x202A].pack('U')}er.com",
    "http://twitt#{[0x202B].pack('U')}er.com",
    "http://twitt#{[0x202C].pack('U')}er.com",
    "http://twitt#{[0x202D].pack('U')}er.com",
    "http://twitt#{[0x202E].pack('U')}er.com"
  ].freeze unless defined?(TestUrls::INVALID)

  # t.co short links, each followed by extra punctuation or characters.
  TCO = [
    "http://t.co/P53cv5yO!",
    "http://t.co/fQJmiPGg***",
    "http://t.co/pbY2NfTZ's",
    "http://t.co/2vYHpAc5;",
    "http://t.co/ulYGBYSo:",
    "http://t.co/GeT4bSiw=win",
    "http://t.co/8MkmHU0k+fun",
    "http://t.co/TKLp64dY.yes,",
    "http://t.co/8vuO27cI$$",
    "http://t.co/rPYTvdA8/",
    "http://t.co/WvtMw5ku%",
    "http://t.co/8t7G3ddS#",
    "http://t.co/nfHNJDV2/#!",
    "http://t.co/gK6NOXHs[good]",
    "http://t.co/dMrT0o1Y]bad",
    "http://t.co/FNkPfmii-",
    "http://t.co/sMgS3pjI_oh",
    "http://t.co/F8Dq3Plb~",
    "http://t.co/ivvH58vC&help",
    "http://t.co/iUBL15zD|NZ5KYLQ8"
  ].freeze unless defined?(TestUrls::TCO)
end
@@ -0,0 +1,31 @@
1
+ # encoding: utf-8
2
+ require File.dirname(__FILE__) + '/spec_helper'
3
+
4
# Specs for the constant-style code point lookups on Incollege::Unicode.
describe Incollege::Unicode do

  it "should lazy-init constants" do
    # Statement order matters: the constant has to be absent before the first
    # reference and present afterwards.
    Incollege::Unicode.const_defined?(:UFEB6).should == false
    looked_up = Incollege::Unicode::UFEB6
    looked_up.should_not be_nil
    looked_up.should be_kind_of(String)
    Incollege::Unicode.const_defined?(:UFEB6).should == true
  end

  it "should return corresponding character" do
    expected_char = [0xfeb6].pack('U')
    Incollege::Unicode::UFEB6.should == expected_char
  end

  it "should allow lowercase notation" do
    lowercase = Incollege::Unicode::Ufeb6
    canonical = Incollege::Unicode::UFEB6
    lowercase.should == canonical
    lowercase.should === canonical
  end

  it "should allow underscore notation" do
    underscored = Incollege::Unicode::U_FEB6
    canonical = Incollege::Unicode::UFEB6
    underscored.should == canonical
    underscored.should === canonical
  end

  it "should raise on invalid codepoints" do
    invalid_lookup = lambda { Incollege::Unicode::FFFFFF }
    invalid_lookup.should raise_error(NameError)
  end

end
@@ -0,0 +1,43 @@
1
+ # encoding: binary
2
+ require File.dirname(__FILE__) + '/spec_helper'
3
+
4
# Minimal host class that mixes in the validation methods under test.
class TestValidation
  include Incollege::Validation
end

# Specs for tweet_invalid?: rejected characters and the 140-character limit.
describe Incollege::Validation do

  it "should disallow invalid BOM character" do
    TestValidation.new.tweet_invalid?("Bom:#{Incollege::Unicode::UFFFE}").should == :invalid_characters
    TestValidation.new.tweet_invalid?("Bom:#{Incollege::Unicode::UFEFF}").should == :invalid_characters
  end

  it "should disallow invalid U+FFFF character" do
    TestValidation.new.tweet_invalid?("Bom:#{Incollege::Unicode::UFFFF}").should == :invalid_characters
  end

  it "should disallow direction change characters" do
    [0x202A, 0x202B, 0x202C, 0x202D, 0x202E].map{|cp| [cp].pack('U') }.each do |char|
      TestValidation.new.tweet_invalid?("Invalid:#{char}").should == :invalid_characters
    end
  end

  it "should disallow non-Unicode" do
    # "\xff" is a single raw byte; the following "f0" are ordinary characters.
    TestValidation.new.tweet_invalid?("not-Unicode:\xfff0").should == :invalid_characters
  end

  it "should allow <= 140 combined accent characters" do
    # 'U*' packs BOTH code points ("e" + combining acute accent). A bare 'U'
    # consumes only the first element, so the accent was silently dropped and
    # the example exercised a plain "e".
    # NOTE(review): assumes the length check normalizes the pair to a single
    # character -- confirm against Incollege::Validation.
    char = [0x65, 0x0301].pack('U*')
    TestValidation.new.tweet_invalid?(char * 139).should == false
    TestValidation.new.tweet_invalid?(char * 140).should == false
    TestValidation.new.tweet_invalid?(char * 141).should == :too_long
  end

  it "should allow <= 140 multi-byte characters" do
    char = [ 0x1d106 ].pack('U')
    TestValidation.new.tweet_invalid?(char * 139).should == false
    TestValidation.new.tweet_invalid?(char * 140).should == false
    TestValidation.new.tweet_invalid?(char * 141).should == :too_long
  end

end
@@ -0,0 +1,207 @@
1
+ require 'multi_json'
2
+ require 'nokogiri'
3
+ require 'test/unit'
4
+ require 'yaml'
5
+
6
# OLD_RUBY is true only on 1.x interpreters older than 1.9; those get
# $KCODE set to 'u' as an encoding fix.
version_parts = RUBY_VERSION.split('.')
OLD_RUBY = version_parts[0].to_i == 1 && version_parts[1].to_i < 9

$KCODE = 'u' if OLD_RUBY
13
+
14
+ $:.unshift File.join(File.dirname(__FILE__), '..', 'lib')
15
+ require 'incollege-text'
16
+
17
# Runs the YAML conformance fixtures found under CONFORMANCE_DIR against the
# Incollege extractor, autolinker, hit highlighter and validator. Every
# fixture entry is turned into one generated test method by
# def_conformance_test.
class ConformanceTest < Test::Unit::TestCase
  include Incollege::Extractor
  include Incollege::Autolink
  include Incollege::HitHighlighter
  include Incollege::Validation

  private

  # Readers for the fields of the fixture entry currently being run.
  %w(description expected json hits).each do |key|
    define_method key.to_sym do
      @test_info[key]
    end
  end

  if OLD_RUBY
    # Fixture text with literal "\u" + 8 hex digit sequences replaced by the
    # corresponding UTF-8 character.
    def text
      @test_info['text'].gsub(/\\u([0-9a-f]{8})/i) do
        [$1.to_i(16)].pack('U*')
      end
    end
  else
    # Fixture text, used as-is.
    def text
      @test_info['text']
    end
  end

  # The expected value (an array of string-keyed hashes) with every hash
  # rebuilt using symbol keys, matching what the *_with_indices extractors
  # are compared against.
  def expected_with_symbol_keys
    expected.map { |elem| elem.inject({}) { |h, (k, v)| h[k.to_sym] = v; h } }
  end

  # Asserts that two HTML fragments parse to the same tree, ignoring the
  # order in which attributes appear on each element.
  def assert_equal_without_attribute_order(expected, actual, failure_message = nil)
    assert_block(build_message(failure_message, "<?> expected but was\n<?>", expected, actual)) do
      equal_nodes?(Nokogiri::HTML(expected).root, Nokogiri::HTML(actual).root)
    end
  end

  # Recursively compares two parsed nodes: name, attributes (order-insensitive),
  # text content, and children pairwise.
  #
  # Returns true when the trees are equal, false otherwise.
  def equal_nodes?(expected, actual)
    # An empty document has no root; two missing nodes are equal, one is not.
    return expected.nil? && actual.nil? if expected.nil? || actual.nil?

    return false unless expected.name == actual.name
    return false unless ordered_attributes(expected) == ordered_attributes(actual)
    return false if expected.text? && actual.text? && expected.content != actual.content

    expected_children = expected.children
    actual_children = actual.children
    # Without this check, extra children on the actual side were ignored and
    # missing ones crashed with NoMethodError on nil.
    return false unless expected_children.length == actual_children.length

    expected_children.each_with_index do |child, index|
      return false unless equal_nodes?(child, actual_children[index])
    end

    true
  end

  # Sorted [name, value] pairs for the attributes of +element+.
  def ordered_attributes(element)
    element.attribute_nodes.map{|attr| [attr.name, attr.value]}.sort
  end

  # Directory holding the fixture files; overridable via the environment.
  CONFORMANCE_DIR = ENV['CONFORMANCE_DIR'] || File.expand_path("../../../conformance", __FILE__)

  # Defines one test method per entry of yaml["tests"][test_type] in +file+.
  #
  # file      - fixture file name relative to CONFORMANCE_DIR.
  # test_type - key (Symbol or String) of the suite inside the file.
  # block     - assertion body, evaluated on the test instance after
  #             @test_info has been set to the fixture entry.
  #
  # Raises RuntimeError when the file has no suite named test_type.
  def self.def_conformance_test(file, test_type, &block)
    yaml = YAML.load_file(File.join(CONFORMANCE_DIR, file))
    suite = yaml["tests"][test_type.to_s]
    raise "No such test suite: #{test_type.to_s}" unless suite

    file_name = file.split('.').first

    suite.each do |test_info|
      name = :"test_#{file_name}_#{test_type} #{test_info['description']}"
      define_method name do
        @test_info = test_info
        instance_eval(&block)
      end
    end
  end

  public

  # Extractor Conformance
  def_conformance_test("extract.yml", :replies) do
    assert_equal expected, extract_reply_screen_name(text), description
  end

  def_conformance_test("extract.yml", :mentions) do
    assert_equal expected, extract_mentioned_screen_names(text), description
  end

  def_conformance_test("extract.yml", :mentions_with_indices) do
    assert_equal expected_with_symbol_keys, extract_mentioned_screen_names_with_indices(text), description
  end

  def_conformance_test("extract.yml", :mentions_or_lists_with_indices) do
    assert_equal expected_with_symbol_keys, extract_mentions_or_lists_with_indices(text), description
  end

  def_conformance_test("extract.yml", :urls) do
    assert_equal expected, extract_urls(text), description
    expected.each do |expected_url|
      assert_equal true, valid_url?(expected_url, true, false), "expected url [#{expected_url}] not valid"
    end
  end

  def_conformance_test("tlds.yml", :generic) do
    assert_equal expected, extract_urls(text), description
  end

  def_conformance_test("tlds.yml", :country) do
    assert_equal expected, extract_urls(text), description
  end

  def_conformance_test("extract.yml", :urls_with_indices) do
    assert_equal expected_with_symbol_keys, extract_urls_with_indices(text), description
  end

  def_conformance_test("extract.yml", :hashtags) do
    assert_equal expected, extract_hashtags(text), description
  end

  def_conformance_test("extract.yml", :hashtags_with_indices) do
    assert_equal expected_with_symbol_keys, extract_hashtags_with_indices(text), description
  end

  def_conformance_test("extract.yml", :cashtags) do
    assert_equal expected, extract_cashtags(text), description
  end

  def_conformance_test("extract.yml", :cashtags_with_indices) do
    assert_equal expected_with_symbol_keys, extract_cashtags_with_indices(text), description
  end

  # Autolink Conformance
  def_conformance_test("autolink.yml", :usernames) do
    assert_equal_without_attribute_order expected, auto_link_usernames_or_lists(text, :suppress_no_follow => true), description
  end

  def_conformance_test("autolink.yml", :lists) do
    assert_equal_without_attribute_order expected, auto_link_usernames_or_lists(text, :suppress_no_follow => true), description
  end

  def_conformance_test("autolink.yml", :urls) do
    assert_equal_without_attribute_order expected, auto_link_urls(text, :suppress_no_follow => true), description
  end

  def_conformance_test("autolink.yml", :hashtags) do
    assert_equal_without_attribute_order expected, auto_link_hashtags(text, :suppress_no_follow => true), description
  end

  def_conformance_test("autolink.yml", :cashtags) do
    assert_equal_without_attribute_order expected, auto_link_cashtags(text, :suppress_no_follow => true), description
  end

  def_conformance_test("autolink.yml", :all) do
    assert_equal_without_attribute_order expected, auto_link(text, :suppress_no_follow => true), description
  end

  def_conformance_test("autolink.yml", :json) do
    assert_equal_without_attribute_order expected, auto_link_with_json(text, MultiJson.load(json), :suppress_no_follow => true), description
  end

  # HitHighlighter Conformance
  def_conformance_test("hit_highlighting.yml", :plain_text) do
    assert_equal expected, hit_highlight(text, hits), description
  end

  def_conformance_test("hit_highlighting.yml", :with_links) do
    assert_equal expected, hit_highlight(text, hits), description
  end

  # Validation Conformance
  def_conformance_test("validate.yml", :tweets) do
    assert_equal expected, valid_tweet_text?(text), description
  end

  def_conformance_test("validate.yml", :usernames) do
    assert_equal expected, valid_username?(text), description
  end

  def_conformance_test("validate.yml", :lists) do
    assert_equal expected, valid_list?(text), description
  end

  def_conformance_test("validate.yml", :urls) do
    assert_equal expected, valid_url?(text), description
  end

  def_conformance_test("validate.yml", :urls_without_protocol) do
    assert_equal expected, valid_url?(text, true, false), description
  end

  def_conformance_test("validate.yml", :hashtags) do
    assert_equal expected, valid_hashtag?(text), description
  end

  def_conformance_test("validate.yml", :lengths) do
    assert_equal expected, tweet_length(text), description
  end
end