wuclan 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
|
|
2
|
+
# http://github.com/Empact/html_test/tree/master
|
|
3
|
+
# http://github.com/michaeledgar/validates_not_profane
|
|
4
|
+
#
|
|
5
|
+
# http://github.com/porras/livevalidation/tree/master
|
|
6
|
+
# Rails plugin which allows automatic integration of your Rails application with Javascript library LiveValidation. This library implements client-side form validation and you can
|
|
7
|
+
#
|
|
8
|
+
# http://github.com/cainlevy/semantic-attributes
|
|
9
|
+
#
|
|
10
|
+
# git://github.com/alexdunae/validates_email_format_of.git
|
|
11
|
+
# Validate e-mail addresses against RFC 2822 and RFC 3696 with this popular Ruby on Rails plugin and gem.
|
|
12
|
+
#
|
|
13
|
+
# http://github.com/freelancing-god/active-matchers/tree/master
|
|
14
|
+
# Helpful rspec matchers for testing validations and associations.
|
|
15
|
+
#
|
|
16
|
+
# http://github.com/redinger/validation_reflection/tree/master
|
|
17
|
+
# refl = Person.reflect_on_validations_for(:name)
|
|
18
|
+
# refl[0].macro
|
|
19
|
+
# => :validates_presence_of
|
|
20
|
+
#
|
|
21
|
+
# http://github.com/augustl/live-validations/tree/master
|
|
22
|
+
# Reads Active Record's validations and makes them available to live client side javascript validation scripts
|
|
23
|
+
#
|
|
24
|
+
# http://github.com/adzap/validates_timeliness/tree/master
|
|
25
|
+
# Date and time validation plugin for Rails 2.x and allows custom date/time formats
|
|
26
|
+
|
|
27
|
+
# http://github.com/matthewrudy/regexpert/tree/master
|
|
28
|
+
# Description: A collection of common Regexps for Ruby. Validation for emails, uk postcode, etc.
|
|
29
|
+
#
|
|
30
|
+
|
|
31
|
+
# http://plugins.jquery.com/project/validate
|
|
32
|
+
#
|
|
33
|
+
#
|
|
34
|
+
|
|
35
|
+
# ===========================================================================
|
|
36
|
+
#
|
|
37
|
+
# # http://github.com/matthewrudy/regexpert/blob/master/lib/regexpert.rb
|
|
38
|
+
#
|
|
39
|
+
# module Format
|
|
40
|
+
# # This is taken from dm-more - http://github.com/sam/dm-more/tree/master/dm-validations/lib/dm-validations/formats/email.rb
|
|
41
|
+
# # RFC2822 (No attribution reference available)
|
|
42
|
+
# #
|
|
43
|
+
# # doctest: email_address
|
|
44
|
+
# # >> "MatthewRudyJacobs@gmail.com" =~ Regexpert::Format::EmailAddress
|
|
45
|
+
# # => 0
|
|
46
|
+
# #
|
|
47
|
+
# # >> "dev@" =~ Regexpert::Format::EmailAddress
|
|
48
|
+
# # => nil
|
|
49
|
+
# #
|
|
50
|
+
# EmailAddress = begin
|
|
51
|
+
# alpha = "a-zA-Z"
|
|
52
|
+
# digit = "0-9"
|
|
53
|
+
# atext = "[#{alpha}#{digit}\!\#\$\%\&\'\*+\/\=\?\^\_\`\{\|\}\~\-]"
|
|
54
|
+
# dot_atom_text = "#{atext}+([.]#{atext}*)*"
|
|
55
|
+
# dot_atom = "#{dot_atom_text}"
|
|
56
|
+
# qtext = '[^\\x0d\\x22\\x5c\\x80-\\xff]'
|
|
57
|
+
# text = "[\\x01-\\x09\\x11\\x12\\x14-\\x7f]"
|
|
58
|
+
# quoted_pair = "(\\x5c#{text})"
|
|
59
|
+
# qcontent = "(?:#{qtext}|#{quoted_pair})"
|
|
60
|
+
# quoted_string = "[\"]#{qcontent}+[\"]"
|
|
61
|
+
# atom = "#{atext}+"
|
|
62
|
+
# word = "(?:#{atom}|#{quoted_string})"
|
|
63
|
+
# obs_local_part = "#{word}([.]#{word})*"
|
|
64
|
+
# local_part = "(?:#{dot_atom}|#{quoted_string}|#{obs_local_part})"
|
|
65
|
+
# no_ws_ctl = "\\x01-\\x08\\x11\\x12\\x14-\\x1f\\x7f"
|
|
66
|
+
# dtext = "[#{no_ws_ctl}\\x21-\\x5a\\x5e-\\x7e]"
|
|
67
|
+
# dcontent = "(?:#{dtext}|#{quoted_pair})"
|
|
68
|
+
# domain_literal = "\\[#{dcontent}+\\]"
|
|
69
|
+
# obs_domain = "#{atom}([.]#{atom})*"
|
|
70
|
+
# domain = "(?:#{dot_atom}|#{domain_literal}|#{obs_domain})"
|
|
71
|
+
# addr_spec = "#{local_part}\@#{domain}"
|
|
72
|
+
# pattern = /^#{addr_spec}$/
|
|
73
|
+
# end
|
|
74
|
+
#
|
|
75
|
+
# # This is taken from dm-more http://github.com/sam/dm-more/tree/master/dm-validations/lib/dm-validations/formats/url.rb
|
|
76
|
+
# # Regex from http://www.igvita.com/2006/09/07/validating-url-in-ruby-on-rails/
|
|
77
|
+
# #
|
|
78
|
+
# # doctest: url # examples from Rails auto_link tests
|
|
79
|
+
# # >> "http://www.rubyonrails.com/contact;new" =~ Regexpert::Format::Url
|
|
80
|
+
# # => 0
|
|
81
|
+
# # >> "http://maps.google.co.uk/maps?f=q&q=the+london+eye&ie=UTF8&ll=51.503373,-0.11939&spn=0.007052,0.012767&z=16&iwloc=A" =~ Regexpert::Format::Url
|
|
82
|
+
# # => 0
|
|
83
|
+
# # >> "http://en.wikipedia.org/wiki/Sprite_(computer_graphics)" =~ Regexpert::Format::Url
|
|
84
|
+
# # => 0
|
|
85
|
+
# # TODO: think of a good example of a bad url
|
|
86
|
+
# Url = begin
|
|
87
|
+
# /(^$)|(^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$)/ix
|
|
88
|
+
# end
|
|
89
|
+
#
|
|
90
|
+
# # This is taken from Django.Contrib.Localflavor.uk
|
|
91
|
+
# # The regular expression used is sourced from the schema for British Standard
|
|
92
|
+
# # BS7666 address types: http://www.govtalk.gov.uk/gdsc/schemas/bs7666-v2-0.xsd
|
|
93
|
+
# #
|
|
94
|
+
# # doctest: ukpostcode
|
|
95
|
+
# # >> "GIR 0AA" =~ Regexpert::Format::UKPostcode # GIR 0AA is a special GIRO postcode
|
|
96
|
+
# # => 0
|
|
97
|
+
# # >> "AL40XB" =~ Regexpert::Format::UKPostcode
|
|
98
|
+
# # => 0
|
|
99
|
+
# # >> "CB4 1TL" =~ Regexpert::Format::UKPostcode
|
|
100
|
+
# # => 0
|
|
101
|
+
# #
|
|
102
|
+
# # >> "AL44 NOP" =~ Regexpert::Format::UKPostcode
|
|
103
|
+
# # => nil
|
|
104
|
+
# # >> "CB4-1TL" =~ Regexpert::Format::UKPostcode
|
|
105
|
+
# # => nil
|
|
106
|
+
# #
|
|
107
|
+
# UKPostcode = begin
|
|
108
|
+
# outcode_pattern = '[A-PR-UWYZ]([0-9]{1,2}|([A-HIK-Y][0-9](|[0-9]|[ABEHMNPRVWXY]))|[0-9][A-HJKSTUW])'
|
|
109
|
+
# incode_pattern = '[0-9][ABD-HJLNP-UW-Z]{2}'
|
|
110
|
+
# postcode_regex = Regexp.new("^(GIR *0AA|#{outcode_pattern} *#{incode_pattern})$", Regexp::IGNORECASE)
|
|
111
|
+
# end
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ===========================================================================
|
|
115
|
+
#
|
|
116
|
+
# http://www.botvector.net/2008/05/regular-expression-samples.html
|
|
117
|
+
#
|
|
118
|
+
#
|
|
119
|
+
# //Address: State code (US)
|
|
120
|
+
# '/\\b(?:A[KLRZ]|C[AOT]|D[CE]|FL|GA|HI|I[ADLN]|K[SY]|LA|M[ADEINOST]|N[CDEHJMVY]|O[HKR]|PA|RI|S[CD]|T[NX]|UT|V[AT]|W[AIVY])\\b/'
|
|
121
|
+
#
|
|
122
|
+
# //Address: ZIP code (US)
|
|
123
|
+
# '\b[0-9]{5}(?:-[0-9]{4})?\b'
|
|
124
|
+
#
|
|
125
|
+
# //Credit card: All major cards
|
|
126
|
+
# '^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6011[0-9]{12}|3(?:0[0-5]|[68][0-9])[0-9]{11}|3[47][0-9]{13})$'
|
|
127
|
+
#
|
|
128
|
+
# //Credit card: American Express
|
|
129
|
+
# '^3[47][0-9]{13}$'
|
|
130
|
+
#
|
|
131
|
+
# //Credit card: Diners Club
|
|
132
|
+
# '^3(?:0[0-5]|[68][0-9])[0-9]{11}$'
|
|
133
|
+
#
|
|
134
|
+
# //Credit card: Discover
|
|
135
|
+
# '^6011[0-9]{12}$'
|
|
136
|
+
#
|
|
137
|
+
# //Credit card: MasterCard
|
|
138
|
+
# '^5[1-5][0-9]{14}$'
|
|
139
|
+
#
|
|
140
|
+
# //Credit card: Visa
|
|
141
|
+
# '^4[0-9]{12}(?:[0-9]{3})?$'
|
|
142
|
+
#
|
|
143
|
+
# //Credit card: remove non-digits
|
|
144
|
+
# '/[^0-9]+/'
|
|
145
|
+
#
|
|
146
|
+
# //Date d/m/yy and dd/mm/yyyy
|
|
147
|
+
# //1/1/00 through 31/12/99 and 01/01/1900 through 31/12/2099
|
|
148
|
+
# //Matches invalid dates such as February 31st
|
|
149
|
+
# '\b(0?[1-9]|[12][0-9]|3[01])[- /.](0?[1-9]|1[012])[- /.](19|20)?[0-9]{2}\b'
|
|
150
|
+
#
|
|
151
|
+
# //Date dd/mm/yyyy
|
|
152
|
+
# //01/01/1900 through 31/12/2099
|
|
153
|
+
# //Matches invalid dates such as February 31st
|
|
154
|
+
# '(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)[0-9]{2}'
|
|
155
|
+
#
|
|
156
|
+
# //Date m/d/y and mm/dd/yyyy
|
|
157
|
+
# //1/1/99 through 12/31/99 and 01/01/1900 through 12/31/2099
|
|
158
|
+
# //Matches invalid dates such as February 31st
|
|
159
|
+
# //Accepts dashes, spaces, forward slashes and dots as date separators
|
|
160
|
+
# '\b(0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])[- /.](19|20)?[0-9]{2}\b'
|
|
161
|
+
#
|
|
162
|
+
# //Date mm/dd/yyyy
|
|
163
|
+
# //01/01/1900 through 12/31/2099
|
|
164
|
+
# //Matches invalid dates such as February 31st
|
|
165
|
+
# '(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)[0-9]{2}'
|
|
166
|
+
#
|
|
167
|
+
# //Date yy-m-d or yyyy-mm-dd
|
|
168
|
+
# //00-1-1 through 99-12-31 and 1900-01-01 through 2099-12-31
|
|
169
|
+
# //Matches invalid dates such as February 31st
|
|
170
|
+
# '\b(19|20)?[0-9]{2}[- /.](0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])\b'
|
|
171
|
+
#
|
|
172
|
+
# //Date yyyy-mm-dd
|
|
173
|
+
# //1900-01-01 through 2099-12-31
|
|
174
|
+
# //Matches invalid dates such as February 31st
|
|
175
|
+
# '(19|20)[0-9]{2}[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
|
|
176
|
+
#
|
|
177
|
+
#
|
|
178
|
+
# //IP address
|
|
179
|
+
# //Matches 0.0.0.0 through 999.999.999.999
|
|
180
|
+
# //Use this fast and simple regex if you know the data does not contain invalid IP addresses.
|
|
181
|
+
# '\b([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\b'
|
|
182
|
+
#
|
|
183
|
+
# //IP address
|
|
184
|
+
# //Matches 0.0.0.0 through 999.999.999.999
|
|
185
|
+
# //Use this fast and simple regex if you know the data does not contain invalid IP addresses,
|
|
186
|
+
# //and you don't need access to the individual IP numbers.
|
|
187
|
+
# '\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
|
|
188
|
+
#
|
|
189
|
+
# //IP address
|
|
190
|
+
# //Matches 0.0.0.0 through 255.255.255.255
|
|
191
|
+
# //Use this regex to match IP numbers with accuracy, without access to the individual IP numbers.
|
|
192
|
+
# '\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
|
|
193
|
+
#
|
|
194
|
+
# //IP address
|
|
195
|
+
# //Matches 0.0.0.0 through 255.255.255.255
|
|
196
|
+
# //Use this regex to match IP numbers with accuracy.
|
|
197
|
+
# //Each of the 4 numbers is stored into a capturing group, so you can access them for further processing.
|
|
198
|
+
# '\b(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
|
|
199
|
+
#
|
|
200
|
+
#
|
|
201
|
+
# //Number: Currency amount
|
|
202
|
+
# //Optional thousands separators; optional two-digit fraction
|
|
203
|
+
# '\b[0-9]{1,3}(?:,?[0-9]{3})*(?:\.[0-9]{2})?\b'
|
|
204
|
+
#
|
|
205
|
+
# //Number: Currency amount
|
|
206
|
+
# //Optional thousands separators; mandatory two-digit fraction
|
|
207
|
+
# '\b[0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2}\b'
|
|
208
|
+
#
|
|
209
|
+
# //Number: floating point
|
|
210
|
+
# //Matches an integer or a floating point number with mandatory integer part. The sign is optional.
|
|
211
|
+
# '[-+]?\b[0-9]+(\.[0-9]+)?\b'
|
|
212
|
+
#
|
|
213
|
+
# //Number: floating point
|
|
214
|
+
# //Matches an integer or a floating point number with optional integer part. The sign is optional.
|
|
215
|
+
# '[-+]?\b[0-9]*\.?[0-9]+\b'
|
|
216
|
+
#
|
|
217
|
+
# //Number: hexadecimal (C-style)
|
|
218
|
+
# '\b0[xX][0-9a-fA-F]+\b'
|
|
219
|
+
#
|
|
220
|
+
# //Number: Insert thousands separators
|
|
221
|
+
# //Replaces 123456789.00 with 123,456,789.00
|
|
222
|
+
# '(?<=[0-9])(?=(?:[0-9]{3})+(?![0-9]))' //Number: integer //Will match 123 and 456 as separate integer numbers in 123.456 '\b\d+\b' //Number: integer //Does not match numbers like 123.456 '(?
|
|
223
|
+
#
|
|
224
|
+
# Passwords
|
|
225
|
+
#
|
|
226
|
+
#
|
|
227
|
+
# //Password complexity
|
|
228
|
+
# //Tests if the input consists of 6 or more letters, digits, underscores and hyphens.
|
|
229
|
+
# //The input must contain at least one upper case letter, one lower case letter and one digit.
|
|
230
|
+
# '\A(?=[-_a-zA-Z0-9]*?[A-Z])(?=[-_a-zA-Z0-9]*?[a-z])(?=[-_a-zA-Z0-9]*?[0-9])[-_a-zA-Z0-9]{6,}\z'
|
|
231
|
+
#
|
|
232
|
+
# //Password complexity
|
|
233
|
+
# //Tests if the input consists of 6 or more characters.
|
|
234
|
+
# //The input must contain at least one upper case letter, one lower case letter and one digit.
|
|
235
|
+
# '\A(?=[-_a-zA-Z0-9]*?[A-Z])(?=[-_a-zA-Z0-9]*?[a-z])(?=[-_a-zA-Z0-9]*?[0-9])\S{6,}\z'
|
|
236
|
+
#
|
|
237
|
+
# //Path: Windows
|
|
238
|
+
# '\b[a-z]:\\[^/:*?"<>|\r\n]*'
|
|
239
|
+
#
|
|
240
|
+
# //Path: Windows
|
|
241
|
+
# //Different elements of the path are captured into backreferences.
|
|
242
|
+
# '\b((?#drive)[a-z]):\\((?#folder)[^/:*?"<>|\r\n]*\\)?((?#file)[^\\/:*?"<>|\r\n]*)'
|
|
243
|
+
#
|
|
244
|
+
# //Path: Windows or UNC
|
|
245
|
+
# '(?:(?#drive)\b[a-z]:|\\\\[a-z0-9]+)\\[^/:*?"<>|\r\n]*'
|
|
246
|
+
#
|
|
247
|
+
# //Path: Windows or UNC
|
|
248
|
+
# //Different elements of the path are captured into backreferences.
|
|
249
|
+
# '((?#drive)\b[a-z]:|\\\\[a-z0-9]+)\\((?#folder)[^/:*?"<>|\r\n]*\\)?((?#file)[^\\/:*?"<>|\r\n]*)'
|
|
250
|
+
|
|
251
|
+
# //Phone Number (North America)
|
|
252
|
+
# //Matches 3334445555, 333.444.5555, 333-444-5555, 333 444 5555, (333) 444 5555 and all combinations thereof.
|
|
253
|
+
# //Replaces all those with (333) 444-5555
|
|
254
|
+
# preg_replace('\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})', '(\1) \2-\3', $text);
|
|
255
|
+
#
|
|
256
|
+
# //Phone Number (North America)
|
|
257
|
+
# //Matches 3334445555, 333.444.5555, 333-444-5555, 333 444 5555, (333) 444 5555 and all combinations thereof.
|
|
258
|
+
# '\(?[0-9]{3}\)?[-. ]?[0-9]{3}[-. ]?[0-9]{4}'
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
# Postal codes
|
|
262
|
+
#
|
|
263
|
+
#
|
|
264
|
+
# //Postal code (Canada)
|
|
265
|
+
# '\b[ABCEGHJKLMNPRSTVXY][0-9][A-Z] [0-9][A-Z][0-9]\b'
|
|
266
|
+
#
|
|
267
|
+
# //Postal code (UK)
|
|
268
|
+
# '\b[A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2}\b'
|
|
269
|
+
#
|
|
270
|
+
|
|
271
|
+
#
|
|
272
|
+
# Programming
|
|
273
|
+
#
|
|
274
|
+
# //Programming: GUID
|
|
275
|
+
# //Microsoft-style GUID, numbers only.
|
|
276
|
+
# '[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}'
|
|
277
|
+
#
|
|
278
|
+
# //Programming: GUID
|
|
279
|
+
# //Microsoft-style GUID, with optional parentheses or braces.
|
|
280
|
+
# //(Long version, if your regex flavor doesn't support conditionals.)
|
|
281
|
+
# '[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}|\([A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}\)|\{[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}\}'
|
|
282
|
+
#
|
|
283
|
+
# //Programming: GUID
|
|
284
|
+
# //Microsoft-style GUID, with optional parentheses or braces.
|
|
285
|
+
# //Short version, illustrating the use of regex conditionals. Not all regex flavors support conditionals.
|
|
286
|
+
# //Also, when applied to large chunks of data, the regex using conditionals will likely be slower
|
|
287
|
+
# //than the long version. Straight alternation is much easier to optimize for a regex engine.
|
|
288
|
+
# '(?:(\()|(\{))?[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}(?(1)\))(?(2)\})'
|
|
289
|
+
#
|
|
290
|
+
# //Programming: Remove escapes
|
|
291
|
+
# //Remove backslashes used to escape other characters
|
|
292
|
+
# preg_replace('\\(.)', '\1', $text);
|
|
293
|
+
#
|
|
294
|
+
# //Programming: String
|
|
295
|
+
# //Quotes may appear in the string when escaped with a backslash.
|
|
296
|
+
# //The string may span multiple lines.
|
|
297
|
+
# '"[^"\\]*(?:\\.[^"\\]*)*"'
|
|
298
|
+
|
|
299
|
+
#
|
|
300
|
+
# Escape
|
|
301
|
+
#
|
|
302
|
+
# //Regex: Escape metacharacters
|
|
303
|
+
# //Place a backslash in front of the regular expression metacharacters
|
|
304
|
+
# gsub("[][{}()*+?.\\^$|]", "\\$0", $text);
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
# 3530588 3.4G /workspace/data lab13
|
|
309
|
+
# 2242028 2.2G /workspace/data lab17
|
|
310
|
+
# 3530588 3.4G /workspace/data lab16
|
|
311
|
+
# 3530588 3.4G /workspace/data lab21
|
|
312
|
+
# 3530588 3.4G /workspace/data lab14
|
|
313
|
+
# 4 4.0K /workspace/data lab12
|
|
314
|
+
# 3530588 3.4G /workspace/data lab15
|
|
315
|
+
# 20 20K /workspace/data lab23
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
# Security
|
|
320
|
+
#
|
|
321
|
+
#
|
|
322
|
+
# //Security: ASCII code characters excl. tab and CRLF
|
|
323
|
+
# //Matches any single non-printable code character that may cause trouble in certain situations.
|
|
324
|
+
# //Excludes tabs and line breaks.
|
|
325
|
+
# '[\x00\x08\x0B\x0C\x0E-\x1F]'
|
|
326
|
+
#
|
|
327
|
+
# //Security: ASCII code characters incl. tab and CRLF
|
|
328
|
+
# //Matches any single non-printable code character that may cause trouble in certain situations.
|
|
329
|
+
# //Includes tabs and line breaks.
|
|
330
|
+
# '[\x00-\x1F]'
|
|
331
|
+
#
|
|
332
|
+
# //Security: Escape quotes and backslashes
|
|
333
|
+
# //E.g. escape user input before inserting it into a SQL statement
|
|
334
|
+
# gsub("\\$0", "\\$0", $text);
|
|
335
|
+
#
|
|
336
|
+
# //Security: Unicode code and unassigned characters excl. tab and CRLF
|
|
337
|
+
# //Matches any single non-printable code character that may cause trouble in certain situations.
|
|
338
|
+
# //Also matches any Unicode code point that is unused in the current Unicode standard,
|
|
339
|
+
# //and thus should not occur in text as it cannot be displayed.
|
|
340
|
+
# //Excludes tabs and line breaks.
|
|
341
|
+
# '[^\P{C}\t\r\n]'
|
|
342
|
+
#
|
|
343
|
+
# //Security: Unicode code and unassigned characters incl. tab and CRLF
|
|
344
|
+
# //Matches any single non-printable code character that may cause trouble in certain situations.
|
|
345
|
+
# //Also matches any Unicode code point that is unused in the current Unicode standard,
|
|
346
|
+
# //and thus should not occur in text as it cannot be displayed.
|
|
347
|
+
# //Includes tabs and line breaks.
|
|
348
|
+
# '\p{C}'
|
|
349
|
+
#
|
|
350
|
+
# //Security: Unicode code characters excl. tab and CRLF
|
|
351
|
+
# //Matches any single non-printable code character that may cause trouble in certain situations.
|
|
352
|
+
# //Excludes tabs and line breaks.
|
|
353
|
+
# '[^\P{Cc}\t\r\n]'
|
|
354
|
+
#
|
|
355
|
+
# //Security: Unicode code characters incl. tab and CRLF
|
|
356
|
+
# //Matches any single non-printable code character that may cause trouble in certain situations.
|
|
357
|
+
# //Includes tabs and line breaks.
|
|
358
|
+
# '\p{Cc}'
|
|
359
|
+
#
|
|
360
|
+
#
|
|
361
|
+
#
|
|
362
|
+
# SSN (Social security numbers)
|
|
363
|
+
#
|
|
364
|
+
#
|
|
365
|
+
# //Social security number (US)
|
|
366
|
+
# '\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b'
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
module Wuclan::Models
|
|
2
|
+
|
|
3
|
+
#
|
|
4
|
+
#
|
|
5
|
+
#
|
|
6
|
+
module TextElementCommon
  # Number of leading fields that make up the record key:
  # the element's text plus the status_id it belongs to.
  def num_key_fields
    2
  end
end
|
|
10
|
+
|
|
11
|
+
#
|
|
12
|
+
# Topical #hashtags extracted from tweet text
|
|
13
|
+
#
|
|
14
|
+
# the twitter_user_id is denormalized
|
|
15
|
+
# but is often what we want: saves a join
|
|
16
|
+
#
|
|
17
|
+
class Hashtag < TypedStruct.new(
    [:hashtag,         String],
    [:status_id,       Integer],
    [:twitter_user_id, Integer]
  )
  # One hashtag occurrence, stored with the status it came from and a
  # denormalized copy of that status's twitter_user_id.
  include ModelCommon
  include TextElementCommon

  # Uniform accessor shared by text elements: the text is the hashtag.
  alias_method(:text, :hashtag)

  # Names of the fields that hold numeric ids.
  def numeric_id_fields
    [:twitter_user_id, :status_id]
  end
end
|
|
27
|
+
|
|
28
|
+
class TweetUrl < TypedStruct.new(
    [:tweet_url,       String],
    [:status_id,       Integer],
    [:twitter_user_id, Integer]
  )
  # One tweet_url value stored with its status_id and twitter_user_id
  # (presumably a URL found in the tweet text -- confirm against the parser).
  include ModelCommon
  include TextElementCommon

  # Uniform accessor shared by text elements: the text is the tweet_url.
  alias_method(:text, :tweet_url)

  # Names of the fields that hold numeric ids.
  def numeric_id_fields
    [:twitter_user_id, :status_id]
  end
end
|
|
38
|
+
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
require 'wuclan/models/tweet/tweet_token'
|
|
2
|
+
require 'wukong/encoding'
|
|
3
|
+
module Wuclan::Models
|
|
4
|
+
Tweet.class_eval do
|
|
5
|
+
def string_for_tokenizing
  # Returns the string that tokenizing should run on, or nil when the
  # tweet should be skipped entirely.
  #
  # Returns nil when the raw text holds more than 20 '&' characters or
  # contains the "just setting up my twttr" default message; otherwise
  # returns decoded_text with every whitespace run collapsed to a single
  # space and leading/trailing whitespace stripped.

  # simpleminded test for non-latin script: don't bother if > 20 entities
  return if text.count('&') > 20
  # skip default message from early days
  return if text =~ /just setting up my twttr/
  # NOTE: no regexp option here on purpose. In Ruby the `s` option is not
  # Perl's dot-all switch: it pins the regexp to the Windows-31J encoding,
  # and matching non-ASCII UTF-8 text against such a regexp raises
  # Encoding::CompatibilityError. \s+ needs no option at all.
  # (decoded_text is defined outside this method -- presumably it decodes
  # the HTML entities in text; confirm.)
  decoded_text.gsub(/\s+/, ' ').strip
end
|
|
13
|
+
|
|
14
|
+
def tokens_for(klass, str)
  # Builds one klass instance for every token klass.extract_tokens! pulls
  # out of str. Each instance receives the token, this tweet's
  # twitter_user_id, this tweet's id and the constant 1 (presumably an
  # initial occurrence count -- confirm against the token classes).
  # NOTE(review): the bang in extract_tokens! suggests str may be modified
  # in place; verify in the token classes.
  extracted = klass.extract_tokens!(str)
  extracted.map { |token| klass.new(token, twitter_user_id, id, 1) }
end
|
|
19
|
+
|
|
20
|
+
def tokenize(extract_word_tokens = nil)
  # Extracts the token objects for this tweet.
  #
  # extract_word_tokens - when truthy, WordToken tokens are extracted too.
  #
  # Returns an Array of token objects; empty when string_for_tokenizing
  # yields nothing usable.
  source = string_for_tokenizing
  return [] if source.blank?

  # Case-sensitive tokens are taken before the string is lowercased.
  found = [SmilieToken, UrlToken].inject([]) do |acc, klass|
    acc + tokens_for(klass, source)
  end

  # Case-insensitive tokens work on the lowercased string.
  source.downcase!
  [RtToken, AtsignToken, HashtagToken].each do |klass|
    found.concat(tokens_for(klass, source))
  end

  found.concat(tokens_for(WordToken, source)) if extract_word_tokens
  found
end
|
|
36
|
+
|
|
37
|
+
end
|
|
38
|
+
end
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
module Wuclan
  module Models
    #
    # Regular expressions (and the string fragments they are assembled from)
    # for recognizing URLs, hashtags, retweets, @-mentions and smilies in
    # tweet text.
    #
    module TweetRegexes
      # ===========================================================================
      #
      # Twitter accepts URLs somewhat idiosyncratically, probably for good reason --
      # we rarely see ()![] in urls; more likely in a status they are punctuation.
      #
      # This is what I've reverse engineered.
      #
      #
      # Notes:
      #
      # * is.gd uses a trailing '-' (to indicate 'preview mode'): clever.
      # * pastoid.com uses a trailing '+', and idek.net a trailing ~ for no reason. annoying.
      #
      # Counterexamples:
      # * http://www.5irecipe.cn/recipe_content/2307/'/
      # * http://www.facebook.com/groups.php?id=1347199977&gv=12#/group.php?gid=18183539495
      #
      RE_DOMAIN_HEAD       = '(?:[a-zA-Z0-9\-]+\.)+'
      RE_DOMAIN_TLD        = '(?:com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum|[a-zA-Z]{2})'
      # RE_URL_SCHEME      = '[a-zA-Z][a-zA-Z0-9\-\+\.]+'
      RE_URL_SCHEME_STRICT = '[a-zA-Z]{3,6}'
      RE_URL_UNRESERVED    = 'a-zA-Z0-9' + '\-\._~'
      RE_URL_OKCHARS       = RE_URL_UNRESERVED + '\'\+\,\;=' + '/%:@' # not !$&()* [] \|
      RE_URL_QUERYCHARS    = RE_URL_OKCHARS    + '&='
      RE_URL_HOSTPART      = "#{RE_URL_SCHEME_STRICT}://#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}"
      RE_URL               = %r{(
                  #{RE_URL_HOSTPART}                          # Host
        (?:(?: \/ [#{RE_URL_OKCHARS}]+?            )*?        # path:  / delimited path segments
           (?: \/ [#{RE_URL_OKCHARS}]*[\w\-\+\~]   )          #        where the last one ends in a non-punctuation.
           |                                                  #        ... or no path segment
           )\/?                                               #        with an optional trailing slash
           (?: \? [#{RE_URL_QUERYCHARS}]+          )?         # query: introduced by a ?, with &foo= delimited segments
           (?: \# [#{RE_URL_OKCHARS}]+             )?         # frag:  introduced by a #
      )}x


      #
      # Technically a scheme can allow the characters '+', '-' and '.' within
      # it.  In practice you can not only ignore those characters but all but a
      # few specific schemes.
      #
      # From a collection of ~9M tweeted urls, 99.4% were http://, with only the additional
      #   https, mms, ftp, git, irc, feed, itpc, rtsp, hxxp, gopher, telnet, itms, ssh, webcal, svn
      # seemingly worth finding:
      #
      #     8925742 http
      #     6026 https  1841 ivo   122 mms     85 ftp      61 git   53 irc   45 feed    31 itpc   12 www
      #       12 rtsp     12 hxxp   12 gopher   9 telnet    9 itms   7 ssh    5 webcal   5 sop     4 wiie
      #        3 svn       3 sssp    3 file     2 res       1 xttp   1 xmlrpc 1 ssl      1 smb
      #
      # An hxxp http://en.wikipedia.org/wiki/Hxxp is used to obscure a link, so
      # make of that what you will.
      #
      # The ivo:// scheme is used by virtual astronomical observatories; as its
      # hostnames are given in reverse-dotted notation (uk.org.estar) these URIs
      # are imperfectly recognized.  Twitter doesn't accept them at all:
      #   http://twitter.com/eSTAR_Project/status/1113930948
      #
      #


      # ===========================================================================
      #
      # A hash following a non-alphanum_ (or at the start of the line),
      # followed by (any number of alpha, num, -_.+:=) and ending in an alphanum_
      #
      # This is overly generous to those dorky triple tags (geo:lat=69.3), but we'll soldier on somehow.
      #
      # The closing delimiter is a lookahead rather than a consumed character:
      # consuming it would leave the next hashtag in "#foo #bar" without the
      # leading non-word character it needs, so it would be skipped.
      #
      RE_HASHTAGS          = %r{(?:^|\W)\#([a-zA-Z0-9\-_\.+:=]+\w)(?=\W|$)}

      # ===========================================================================
      #
      # Retweets and Retweet Whores
      #
      # See ARetweetsB for more info.
      #
      # A retweet
      #   RT @interesting_user Something so witty Dorothy Parker would just give up
      #   Oh yeah and so's your mom (via @sixth_grader)
      #   retweeting @ogre: KEGGER TONITE RT pls
      #   ^^^ this is not a rtwhore; it matches first as a retweet
      #
      # and rtwhores
      #   retweet please: Hey here's something I'm whoring xxx
      #   KEGGER TONITE RT pls
      #
      # or semantically-incorrect matches such as (actual example):
      #   @somebody lol, love the 'please retweet' ending!
      #
      # Things that don't match:
      #   retweet is silly, @i_think_youre_dumb
      #   misspell the name of my Sony Via
      #
      RE_RETWEET_WORDS     = 'rt|retweet|retweeting'
      RE_RETWEET_ONLY      = %r{(?:#{RE_RETWEET_WORDS})}
      RE_RETWEET_OR_VIA    = %r{(?:#{RE_RETWEET_WORDS}|via|from)}
      RE_PLEASE            = %r{(?:please|plz|pls)}
      # Interpolating a Regexp object embeds it as (?-mix:...), which turns the
      # outer /i flag back off for that fragment. Splice in the bare .source
      # instead so that "RT @user" and "RT pls" match as the examples above say.
      RE_RETWEET           = %r{\b#{RE_RETWEET_OR_VIA.source}\W*@(\w+)\b}i
      RE_RTWHORE           = %r{
            \b#{RE_RETWEET_ONLY.source}\W*#{RE_PLEASE.source}\b
          | \b#{RE_PLEASE.source}\W*#{RE_RETWEET_ONLY.source}\b}ix

      # ===========================================================================
      #
      # following either the start of the line, or a non-alphanum_ character
      # the string of following [a-zA-Z0-9_]
      #
      # Note carefully: we _demand_ a preceding character (or start of line):
      # \b would match email@address.com, which we don't want.
      #
      # Making an exception for RT@im_cramped_for_space.
      # (This pattern is case-sensitive, so the exception only applies to
      # lower-case "rt@..." text.)
      #
      # All retweets
      #
      RE_ATSIGNS           = %r{(?:^|\W|#{RE_RETWEET_OR_VIA})@(\w+)\b}



      # ===========================================================================
      #
      # Numbers (unused sketch, kept for reference)
      #

      # RE_NUMBERS = %r{
      #   (?:^|\D)               # non-number
      #   (
      #     |(?:\(\d{3}\)[\ \-]?\d{3}[\ \-]\d{4})
      #     |(?: (?:\d{1,3}\.)(?:\d{3},)*\.?\d+)   # decimal number
      #     |(?: (?:\d{1,3}\.)(?:\d{3}\.)*,?\d+)   # euro-style
      #     \d+
      #   )
      # }x
      #
      # # IP address
      # \b(?:\d{1,3}\.){3}\d{1,3}\b
      # credit card: (lax)
      #   \b(?:\d[ -]*){13,16}\b
      #   \b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})\b
      #
      # [-+]?[0-9,]*\.?[0-9]*
      # [-+]?[0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?

      # ===========================================================================
      #
      # Smilies !!! ^_^
      #
      # The three strings below are character-class contents (eyes, nose,
      # mouth) that RE_SMILIES drops inside [...].
      #
      RE_SMILIES_EYES      = "\\:8;"
      RE_SMILIES_NOSE      = "\\-=\\*o"
      RE_SMILIES_MOUTH     = "DP@Oo\\(\\)\\[\\]\\|\\{\\}\\/\\\\"
      RE_SMILIES           = %r{
          (?:^|\W)                     # non-smilie character
          ( (?:
              >?
              [#{RE_SMILIES_EYES}]     # eyes
              [#{RE_SMILIES_NOSE}]?    # nose, maybe
              [#{RE_SMILIES_MOUTH}] )  # mouth
            |(?:
              [#{RE_SMILIES_MOUTH}]    # mouth
              [#{RE_SMILIES_NOSE}]?    # nose, maybe
              [#{RE_SMILIES_EYES}]     # eyes
              <? )
            |(?: =[#{RE_SMILIES_MOUTH}])  # =) (=
            |(?: [#{RE_SMILIES_MOUTH}]=)  # =) (=
            |(?: \^[_\-]\^ )              # kawaaaaiiii!
            |(?: :[,\']\( )               # snif
            |(?: <3 )                     # heart
            |(?: \\m/ )                   # rawk
            |(?: x-\( )                   # dead
          )
          (?=\W|$)                     # delimiter is only peeked at, so back-to-back smilies all match
      }x
    end
  end
end
|
+
# This is what I've reverse engineered.
|
|
11
|
+
#
|
|
12
|
+
#
|
|
13
|
+
# Notes:
|
|
14
|
+
#
|
|
15
|
+
# * is.gd uses a trailing '-' (to indicate 'preview mode'): clever.
|
|
16
|
+
# * pastoid.com uses a trailing '+', and idek.net a trailing ~ for no reason. annoying.
|
|
17
|
+
#
|
|
18
|
+
# Counterexamples:
|
|
19
|
+
# * http://www.5irecipe.cn/recipe_content/2307/'/
|
|
20
|
+
# * http://www.facebook.com/groups.php?id=1347199977&gv=12#/group.php?gid=18183539495
|
|
21
|
+
#
|
|
22
|
+
RE_DOMAIN_HEAD = '(?:[a-zA-Z0-9\-]+\.)+'
|
|
23
|
+
RE_DOMAIN_TLD = '(?:com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum|[a-zA-Z]{2})'
|
|
24
|
+
# RE_URL_SCHEME = '[a-zA-Z][a-zA-Z0-9\-\+\.]+'
|
|
25
|
+
RE_URL_SCHEME_STRICT = '[a-zA-Z]{3,6}'
|
|
26
|
+
RE_URL_UNRESERVED = 'a-zA-Z0-9' + '\-\._~'
|
|
27
|
+
RE_URL_OKCHARS = RE_URL_UNRESERVED + '\'\+\,\;=' + '/%:@' # not !$&()* [] \|
|
|
28
|
+
RE_URL_QUERYCHARS = RE_URL_OKCHARS + '&='
|
|
29
|
+
RE_URL_HOSTPART = "#{RE_URL_SCHEME_STRICT}://#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}"
|
|
30
|
+
RE_URL = %r{(
|
|
31
|
+
#{RE_URL_HOSTPART} # Host
|
|
32
|
+
(?:(?: \/ [#{RE_URL_OKCHARS}]+? )*? # path: / delimited path segments
|
|
33
|
+
(?: \/ [#{RE_URL_OKCHARS}]*[\w\-\+\~] ) # where the last one ends in a non-punctuation.
|
|
34
|
+
| # ... or no path segment
|
|
35
|
+
)\/? # with an optional trailing slash
|
|
36
|
+
(?: \? [#{RE_URL_QUERYCHARS}]+ )? # query: introduced by a ?, with &foo= delimited segments
|
|
37
|
+
(?: \# [#{RE_URL_OKCHARS}]+ )? # frag: introduced by a #
|
|
38
|
+
)}x
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
#
|
|
42
|
+
# Technically a scheme can allow the characters '+', '-' and '.' within
|
|
43
|
+
# it. In practice you can not only ignore those characters but all but a
|
|
44
|
+
# few specific schemes.
|
|
45
|
+
#
|
|
46
|
+
# From a collection of ~9M tweeted urls, 99.4% were http://, with only the additional
|
|
47
|
+
# https, mms, ftp, git, irc, feed, itpc, rtsp, hxxp, gopher, telnet, itms, ssh, webcal, svn
|
|
48
|
+
# seemingly worth finding:
|
|
49
|
+
#
|
|
50
|
+
# 8925742 http
|
|
51
|
+
# 6026 https 1841 ivo 122 mms 85 ftp 61 git 53 irc 45 feed 31 itpc 12 www
|
|
52
|
+
# 12 rtsp 12 hxxp 12 gopher 9 telnet 9 itms 7 ssh 5 webcal 5 sop 4 wiie
|
|
53
|
+
# 3 svn 3 sssp 3 file 2 res 1 xttp 1 xmlrpc 1 ssl 1 smb
|
|
54
|
+
#
|
|
55
|
+
# An hxxp http://en.wikipedia.org/wiki/Hxxp is used to obscure a link, so
|
|
56
|
+
# take of that what you may.
|
|
57
|
+
#
|
|
58
|
+
# The ivo:// scheme is used by virtual astronomical observatories; as its
|
|
59
|
+
# hostnames are given in reverse-dotted notation (uk.org.estar) these URIs
|
|
60
|
+
# are imperfectly recognized. Twitter doesn't accept them at all:
|
|
61
|
+
# http://twitter.com/eSTAR_Project/status/1113930948
|
|
62
|
+
#
|
|
63
|
+
#
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# ===========================================================================
|
|
67
|
+
#
|
|
68
|
+
# A hash following a non-alphanum_ (or at the start of the line
|
|
69
|
+
# followed by (any number of alpha, num, -_.+:=) and ending in an alphanum_
|
|
70
|
+
#
|
|
71
|
+
# This is overly generous to those dorky triple tags (geo:lat=69.3), but we'll soldier on somehow.
|
|
72
|
+
#
|
|
73
|
+
RE_HASHTAGS = %r{(?:^|\W)\#([a-zA-Z0-9\-_\.+:=]+\w)(?:\W|$)}
|
|
74
|
+
|
|
75
|
+
# ===========================================================================
|
|
76
|
+
#
|
|
77
|
+
# Retweets and Retweet Whores
|
|
78
|
+
#
|
|
79
|
+
# See ARetweetsB for more info.
|
|
80
|
+
#
|
|
81
|
+
# A retweet
|
|
82
|
+
# RT @interesting_user Something so witty Dorothy Parker would just give up
|
|
83
|
+
# Oh yeah and so's your mom (via @sixth_grader)
|
|
84
|
+
# retweeting @ogre: KEGGER TONITE RT pls
|
|
85
|
+
# ^^^ this is not a rtwhore; it matches first as a retweet
|
|
86
|
+
#
|
|
87
|
+
# and rtwhores
|
|
88
|
+
# retweet please: Hey here's something I'm whoring xxx
|
|
89
|
+
# KEGGER TONITE RT pls
|
|
90
|
+
#
|
|
91
|
+
# or semantically-incorrect matches such as (actual example):
|
|
92
|
+
# @somebody lol, love the 'please retweet' ending!
|
|
93
|
+
#
|
|
94
|
+
# Things that don't match:
|
|
95
|
+
# retweet is silly, @i_think_youre_dumb
|
|
96
|
+
# misspell the name of my Sony Via
|
|
97
|
+
#
|
|
98
|
+
RE_RETWEET_WORDS = 'rt|retweet|retweeting'
|
|
99
|
+
RE_RETWEET_ONLY = %r{(?:#{RE_RETWEET_WORDS})}
|
|
100
|
+
RE_RETWEET_OR_VIA = %r{(?:#{RE_RETWEET_WORDS}|via|from)}
|
|
101
|
+
RE_PLEASE = %r{(?:please|plz|pls)}
|
|
102
|
+
RE_RETWEET = %r{\b#{RE_RETWEET_OR_VIA}\W*@(\w+)\b}i
|
|
103
|
+
RE_RTWHORE = %r{
|
|
104
|
+
\b#{RE_RETWEET_ONLY}\W*#{RE_PLEASE}\b
|
|
105
|
+
| \b#{RE_PLEASE}\W*#{RE_RETWEET_ONLY}\b}ix
|
|
106
|
+
|
|
107
|
+
# ===========================================================================
|
|
108
|
+
#
|
|
109
|
+
# following either the start of the line, or a non-alphanum_ character
|
|
110
|
+
# the string of following [a-zA-Z0-9_]
|
|
111
|
+
#
|
|
112
|
+
# Note carefully: we _demand_ a preceding character (or start of line):
|
|
113
|
+
# \b would match email@address.com, which we don't want.
|
|
114
|
+
#
|
|
115
|
+
# Making an exception for RT@im_cramped_for_space.
|
|
116
|
+
#
|
|
117
|
+
# All retweets
|
|
118
|
+
#
|
|
119
|
+
RE_ATSIGNS = %r{(?:^|\W|#{RE_RETWEET_OR_VIA})@(\w+)\b}
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# ===========================================================================
|
|
124
|
+
#
|
|
125
|
+
# Smilies !!! ^_^
|
|
126
|
+
#
|
|
127
|
+
|
|
128
|
+
# RE_NUMBERS = %r{
|
|
129
|
+
# (?:^|\D) # non-number
|
|
130
|
+
# (
|
|
131
|
+
# |(?:\(\d{3}\)[\ \-]?\d{3}[\ \-]\d{4})
|
|
132
|
+
# |(?: (?:\d{1,3}\.)(?:\d{3},)*\.?\d+) # decimal number
|
|
133
|
+
# |(?: (?:\d{1,3}\.)(?:\d{3}\.)*,?\d+) # euro-style
|
|
134
|
+
# \d+
|
|
135
|
+
# )
|
|
136
|
+
# }x
|
|
137
|
+
#
|
|
138
|
+
# # IP address
|
|
139
|
+
# \b(?:\d{1,3}\.){3}\d{1,3}\b
|
|
140
|
+
# credit card: (lax)
|
|
141
|
+
# \b(?:\d[ -]*){13,16}\b
|
|
142
|
+
# \b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})\b
|
|
143
|
+
#
|
|
144
|
+
# [-+]?[0-9,]*\.?[0-9]*
|
|
145
|
+
# [-+]?[0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?
|
|
146
|
+
|
|
147
|
+
# ===========================================================================
|
|
148
|
+
#
|
|
149
|
+
# Smilies !!! ^_^
|
|
150
|
+
#
|
|
151
|
+
RE_SMILIES_EYES = "\\:8;"
|
|
152
|
+
RE_SMILIES_NOSE = "\\-=\\*o"
|
|
153
|
+
RE_SMILIES_MOUTH = "DP@Oo\\(\\)\\[\\]\\|\\{\\}\\/\\\\"
|
|
154
|
+
RE_SMILIES = %r{
|
|
155
|
+
(?:^|\W) # non-smilie character
|
|
156
|
+
( (?:
|
|
157
|
+
>?
|
|
158
|
+
[#{RE_SMILIES_EYES}] # eyes
|
|
159
|
+
[#{RE_SMILIES_NOSE}]? # nose, maybe
|
|
160
|
+
[#{RE_SMILIES_MOUTH}] ) # mouth
|
|
161
|
+
|(?:
|
|
162
|
+
[#{RE_SMILIES_MOUTH}] # mouth
|
|
163
|
+
[#{RE_SMILIES_NOSE}]? # nose, maybe
|
|
164
|
+
[#{RE_SMILIES_EYES}] # eyes
|
|
165
|
+
<? )
|
|
166
|
+
|(?: =[#{RE_SMILIES_MOUTH}]) # =) (=
|
|
167
|
+
|(?: [#{RE_SMILIES_MOUTH}]=) # =) (=
|
|
168
|
+
|(?: \^[_\-]\^ ) # kawaaaaiiii!
|
|
169
|
+
|(?: :[,\']\( ) # snif
|
|
170
|
+
|(?: <3 ) # heart
|
|
171
|
+
|(?: \\m/ ) # rawk
|
|
172
|
+
|(?: x-\( ) # dead
|
|
173
|
+
)
|
|
174
|
+
(?:\W|$)
|
|
175
|
+
}x
|
|
176
|
+
end
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# http://mail.google.com/support/bin/answer.py?hl=en&answer=34056
|
|
182
|
+
# http://en.wikipedia.org/wiki/Emoticons
|
|
183
|
+
#
|
|
184
|
+
# :-) :) =] =) Smiling, happy
|
|
185
|
+
# :-( =( :[ :< frowning, Sad
|
|
186
|
+
# ;-) ;) ;] Wink
|
|
187
|
+
# :D =D XD BD Large grin or laugh
|
|
188
|
+
# :P =P XP Tongue out, or after a joke
|
|
189
|
+
# <3 S2 :> Love
|
|
190
|
+
# :O =O Shocked or surprised
|
|
191
|
+
# =I :/ :-\ Bored, annoyed or awkward; concerned.
|
|
192
|
+
# :S =S :? Confused, embarrassed or uneasy
|
|
193
|
+
|
|
194
|
+
# Icon Meaning Icon Meaning Icon Meaning
|
|
195
|
+
# (^_^) smile (^o^) laughing out loud d(^_^)b thumbs up (not ears)
|
|
196
|
+
# (T_T) sad (crying face) (-.-)Zzz sleeping (Z.Z) sleepy person
|
|
197
|
+
# \(^_^)/ cheers, "Hurrah!" (*^^*) shyness (-_-); sweating (as in ashamed), or exasperated.
|
|
198
|
+
# (*3*) "Surprise !." (?_?) "Nonsense, I don't know." (^_~) wink
|
|
199
|
+
# (o.O) shocked/disturbed (<.<) shifty, suspicious v(^_^)v peace
|
|
200
|
+
#
|
|
201
|
+
# [\\dv](^_^)[bv/]
|
|
202
|
+
#
|