wuclan 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
@@ -0,0 +1,370 @@
1
+
2
+ # http://github.com/Empact/html_test/tree/master
3
+ # http://github.com/michaeledgar/validates_not_profane
4
+ #
5
+ # http://github.com/porras/livevalidation/tree/master
6
+ # Rails plugin which allows automatic integration of your Rails application with Javascript library LiveValidation. This library implements client-side form validation and you can
7
+ #
8
+ # http://github.com/cainlevy/semantic-attributes
9
+ #
10
+ # git://github.com/alexdunae/validates_email_format_of.git
11
+ # Validate e-mail addresses against RFC 2822 and RFC 3696 with this popular Ruby on Rails plugin and gem.
12
+ #
13
+ # http://github.com/freelancing-god/active-matchers/tree/master
14
+ # Helpful rspec matchers for testing validations and associations.
15
+ #
16
+ # http://github.com/redinger/validation_reflection/tree/master
17
+ # refl = Person.reflect_on_validations_for(:name)
18
+ # refl[0].macro
19
+ # => :validates_presence_of
20
+ #
21
+ # http://github.com/augustl/live-validations/tree/master
22
+ # Reads Active Record's validations and makes them available to live client side javascript validation scripts
23
+ #
24
+ # http://github.com/adzap/validates_timeliness/tree/master
25
+ # Date and time validation plugin for Rails 2.x and allows custom date/time formats
26
+
27
+ # http://github.com/matthewrudy/regexpert/tree/master
28
+ # Description: A collection of common Regexps for Ruby. Validation for emails, uk postcode, etc.
29
+ #
30
+
31
+ # http://plugins.jquery.com/project/validate
32
+ #
33
+ #
34
+
35
+ # ===========================================================================
36
+ #
37
+ # # http://github.com/matthewrudy/regexpert/blob/master/lib/regexpert.rb
38
+ #
39
+ # module Format
40
+ # # This is taken from dm-more - http://github.com/sam/dm-more/tree/master/dm-validations/lib/dm-validations/formats/email.rb
41
+ # # RFC2822 (No attribution reference available)
42
+ # #
43
+ # # doctest: email_address
44
+ # # >> "MatthewRudyJacobs@gmail.com" =~ Regexpert::Format::EmailAddress
45
+ # # => 0
46
+ # #
47
+ # # >> "dev@" =~ Regexpert::Format::EmailAddress
48
+ # # => nil
49
+ # #
50
+ # EmailAddress = begin
51
+ # alpha = "a-zA-Z"
52
+ # digit = "0-9"
53
+ # atext = "[#{alpha}#{digit}\!\#\$\%\&\'\*+\/\=\?\^\_\`\{\|\}\~\-]"
54
+ # dot_atom_text = "#{atext}+([.]#{atext}*)*"
55
+ # dot_atom = "#{dot_atom_text}"
56
+ # qtext = '[^\\x0d\\x22\\x5c\\x80-\\xff]'
57
+ # text = "[\\x01-\\x09\\x11\\x12\\x14-\\x7f]"
58
+ # quoted_pair = "(\\x5c#{text})"
59
+ # qcontent = "(?:#{qtext}|#{quoted_pair})"
60
+ # quoted_string = "[\"]#{qcontent}+[\"]"
61
+ # atom = "#{atext}+"
62
+ # word = "(?:#{atom}|#{quoted_string})"
63
+ # obs_local_part = "#{word}([.]#{word})*"
64
+ # local_part = "(?:#{dot_atom}|#{quoted_string}|#{obs_local_part})"
65
+ # no_ws_ctl = "\\x01-\\x08\\x11\\x12\\x14-\\x1f\\x7f"
66
+ # dtext = "[#{no_ws_ctl}\\x21-\\x5a\\x5e-\\x7e]"
67
+ # dcontent = "(?:#{dtext}|#{quoted_pair})"
68
+ # domain_literal = "\\[#{dcontent}+\\]"
69
+ # obs_domain = "#{atom}([.]#{atom})*"
70
+ # domain = "(?:#{dot_atom}|#{domain_literal}|#{obs_domain})"
71
+ # addr_spec = "#{local_part}\@#{domain}"
72
+ # pattern = /^#{addr_spec}$/
73
+ # end
74
+ #
75
+ # # This is taken from dm-more http://github.com/sam/dm-more/tree/master/dm-validations/lib/dm-validations/formats/url.rb
76
+ # # Regex from http://www.igvita.com/2006/09/07/validating-url-in-ruby-on-rails/
77
+ # #
78
+ # # doctest: url # examples from Rails auto_link tests
79
+ # # >> "http://www.rubyonrails.com/contact;new" =~ Regexpert::Format::Url
80
+ # # => 0
81
+ # # >> "http://maps.google.co.uk/maps?f=q&q=the+london+eye&ie=UTF8&ll=51.503373,-0.11939&spn=0.007052,0.012767&z=16&iwloc=A" =~ Regexpert::Format::Url
82
+ # # => 0
83
+ # # >> "http://en.wikipedia.org/wiki/Sprite_(computer_graphics)" =~ Regexpert::Format::Url
84
+ # # => 0
85
+ # # TODO: think of a good example of a bad url
86
+ # Url = begin
87
+ # /(^$)|(^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$)/ix
88
+ # end
89
+ #
90
+ # # This is taken from Django.Contrib.Localflavor.uk
91
+ # # The regular expression used is sourced from the schema for British Standard
92
+ # # BS7666 address types: http://www.govtalk.gov.uk/gdsc/schemas/bs7666-v2-0.xsd
93
+ # #
94
+ # # doctest: ukpostcode
95
+ # # >> "GIR 0AA" =~ Regexpert::Format::UKPostcode # GIR 0AA is a special GIRO postcode
96
+ # # => 0
97
+ # # >> "AL40XB" =~ Regexpert::Format::UKPostcode
98
+ # # => 0
99
+ # # >> "CB4 1TL" =~ Regexpert::Format::UKPostcode
100
+ # # => 0
101
+ # #
102
+ # # >> "AL44 NOP" =~ Regexpert::Format::UKPostcode
103
+ # # => nil
104
+ # # >> "CB4-1TL" =~ Regexpert::Format::UKPostcode
105
+ # # => nil
106
+ # #
107
+ # UKPostcode = begin
108
+ # outcode_pattern = '[A-PR-UWYZ]([0-9]{1,2}|([A-HIK-Y][0-9](|[0-9]|[ABEHMNPRVWXY]))|[0-9][A-HJKSTUW])'
109
+ # incode_pattern = '[0-9][ABD-HJLNP-UW-Z]{2}'
110
+ # postcode_regex = Regexp.new("^(GIR *0AA|#{outcode_pattern} *#{incode_pattern})$", Regexp::IGNORECASE)
111
+ # end
112
+
113
+
114
+ # ===========================================================================
115
+ #
116
+ # http://www.botvector.net/2008/05/regular-expression-samples.html
117
+ #
118
+ #
119
+ # //Address: State code (US)
120
+ # '/\\b(?:A[KLRZ]|C[AOT]|D[CE]|FL|GA|HI|I[ADLN]|K[SY]|LA|M[ADEINOST]|N[CDEHJMVY]|O[HKR]|PA|RI|S[CD]|T[NX]|UT|V[AT]|W[AIVY])\\b/'
121
+ #
122
+ # //Address: ZIP code (US)
123
+ # '\b[0-9]{5}(?:-[0-9]{4})?\b'
124
+ #
125
+ # //Credit card: All major cards
126
+ # '^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6011[0-9]{12}|3(?:0[0-5]|[68][0-9])[0-9]{11}|3[47][0-9]{13})$'
127
+ #
128
+ # //Credit card: American Express
129
+ # '^3[47][0-9]{13}$'
130
+ #
131
+ # //Credit card: Diners Club
132
+ # '^3(?:0[0-5]|[68][0-9])[0-9]{11}$'
133
+ #
134
+ # //Credit card: Discover
135
+ # '^6011[0-9]{12}$'
136
+ #
137
+ # //Credit card: MasterCard
138
+ # '^5[1-5][0-9]{14}$'
139
+ #
140
+ # //Credit card: Visa
141
+ # '^4[0-9]{12}(?:[0-9]{3})?$'
142
+ #
143
+ # //Credit card: remove non-digits
144
+ # '/[^0-9]+/'
145
+ #
146
+ # //Date d/m/yy and dd/mm/yyyy
147
+ # //1/1/00 through 31/12/99 and 01/01/1900 through 31/12/2099
148
+ # //Matches invalid dates such as February 31st
149
+ # '\b(0?[1-9]|[12][0-9]|3[01])[- /.](0?[1-9]|1[012])[- /.](19|20)?[0-9]{2}\b'
150
+ #
151
+ # //Date dd/mm/yyyy
152
+ # //01/01/1900 through 31/12/2099
153
+ # //Matches invalid dates such as February 31st
154
+ # '(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)[0-9]{2}'
155
+ #
156
+ # //Date m/d/y and mm/dd/yyyy
157
+ # //1/1/99 through 12/31/99 and 01/01/1900 through 12/31/2099
158
+ # //Matches invalid dates such as February 31st
159
+ # //Accepts dashes, spaces, forward slashes and dots as date separators
160
+ # '\b(0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])[- /.](19|20)?[0-9]{2}\b'
161
+ #
162
+ # //Date mm/dd/yyyy
163
+ # //01/01/1900 through 12/31/2099
164
+ # //Matches invalid dates such as February 31st
165
+ # '(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)[0-9]{2}'
166
+ #
167
+ # //Date yy-m-d or yyyy-mm-dd
168
+ # //00-1-1 through 99-12-31 and 1900-01-01 through 2099-12-31
169
+ # //Matches invalid dates such as February 31st
170
+ # '\b(19|20)?[0-9]{2}[- /.](0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])\b'
171
+ #
172
+ # //Date yyyy-mm-dd
173
+ # //1900-01-01 through 2099-12-31
174
+ # //Matches invalid dates such as February 31st
175
+ # '(19|20)[0-9]{2}[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
176
+ #
177
+ #
178
+ # //IP address
179
+ # //Matches 0.0.0.0 through 999.999.999.999
180
+ # //Use this fast and simple regex if you know the data does not contain invalid IP addresses.
181
+ # '\b([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\b'
182
+ #
183
+ # //IP address
184
+ # //Matches 0.0.0.0 through 999.999.999.999
185
+ # //Use this fast and simple regex if you know the data does not contain invalid IP addresses,
186
+ # //and you don't need access to the individual IP numbers.
187
+ # '\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
188
+ #
189
+ # //IP address
190
+ # //Matches 0.0.0.0 through 255.255.255.255
191
+ # //Use this regex to match IP numbers with accuracy, without access to the individual IP numbers.
192
+ # '\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
193
+ #
194
+ # //IP address
195
+ # //Matches 0.0.0.0 through 255.255.255.255
196
+ # //Use this regex to match IP numbers with accuracy.
197
+ # //Each of the 4 numbers is stored into a capturing group, so you can access them for further processing.
198
+ # '\b(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
199
+ #
200
+ #
201
+ # //Number: Currency amount
202
+ # //Optional thousands separators; optional two-digit fraction
203
+ # '\b[0-9]{1,3}(?:,?[0-9]{3})*(?:\.[0-9]{2})?\b'
204
+ #
205
+ # //Number: Currency amount
206
+ # //Optional thousands separators; mandatory two-digit fraction
207
+ # '\b[0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2}\b'
208
+ #
209
+ # //Number: floating point
210
+ # //Matches an integer or a floating point number with mandatory integer part. The sign is optional.
211
+ # '[-+]?\b[0-9]+(\.[0-9]+)?\b'
212
+ #
213
+ # //Number: floating point
214
+ # //Matches an integer or a floating point number with optional integer part. The sign is optional.
215
+ # '[-+]?\b[0-9]*\.?[0-9]+\b'
216
+ #
217
+ # //Number: hexadecimal (C-style)
218
+ # '\b0[xX][0-9a-fA-F]+\b'
219
+ #
220
+ # //Number: Insert thousands separators
221
+ # //Replaces 123456789.00 with 123,456,789.00
222
+ # '(?<=[0-9])(?=(?:[0-9]{3})+(?![0-9]))' //Number: integer //Will match 123 and 456 as separate integer numbers in 123.456 '\b\d+\b' //Number: integer //Does not match numbers like 123.456 '(?
223
+ #
224
+ # Passwords
225
+ #
226
+ #
227
+ # //Password complexity
228
+ # //Tests if the input consists of 6 or more letters, digits, underscores and hyphens.
229
+ # //The input must contain at least one upper case letter, one lower case letter and one digit.
230
+ # '\A(?=[-_a-zA-Z0-9]*?[A-Z])(?=[-_a-zA-Z0-9]*?[a-z])(?=[-_a-zA-Z0-9]*?[0-9])[-_a-zA-Z0-9]{6,}\z'
231
+ #
232
+ # //Password complexity
233
+ # //Tests if the input consists of 6 or more characters.
234
+ # //The input must contain at least one upper case letter, one lower case letter and one digit.
235
+ # '\A(?=[-_a-zA-Z0-9]*?[A-Z])(?=[-_a-zA-Z0-9]*?[a-z])(?=[-_a-zA-Z0-9]*?[0-9])\S{6,}\z'
236
+ #
237
+ # //Path: Windows
238
+ # '\b[a-z]:\\[^/:*?"<>|\r\n]*'
239
+ #
240
+ # //Path: Windows
241
+ # //Different elements of the path are captured into backreferences.
242
+ # '\b((?#drive)[a-z]):\\((?#folder)[^/:*?"<>|\r\n]*\\)?((?#file)[^\\/:*?"<>|\r\n]*)'
243
+ #
244
+ # //Path: Windows or UNC
245
+ # '(?:(?#drive)\b[a-z]:|\\\\[a-z0-9]+)\\[^/:*?"<>|\r\n]*'
246
+ #
247
+ # //Path: Windows or UNC
248
+ # //Different elements of the path are captured into backreferences.
249
+ # '((?#drive)\b[a-z]:|\\\\[a-z0-9]+)\\((?#folder)[^/:*?"<>|\r\n]*\\)?((?#file)[^\\/:*?"<>|\r\n]*)'
250
+
251
+ # //Phone Number (North America)
252
+ # //Matches 3334445555, 333.444.5555, 333-444-5555, 333 444 5555, (333) 444 5555 and all combinations thereof.
253
+ # //Replaces all those with (333) 444-5555
254
+ # preg_replace('\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})', '(\1) \2-\3', $text);
255
+ #
256
+ # //Phone Number (North America)
257
+ # //Matches 3334445555, 333.444.5555, 333-444-5555, 333 444 5555, (333) 444 5555 and all combinations thereof.
258
+ # '\(?[0-9]{3}\)?[-. ]?[0-9]{3}[-. ]?[0-9]{4}'
259
+
260
+
261
+ # Postal codes
262
+ #
263
+ #
264
+ # //Postal code (Canada)
265
+ # '\b[ABCEGHJKLMNPRSTVXY][0-9][A-Z] [0-9][A-Z][0-9]\b'
266
+ #
267
+ # //Postal code (UK)
268
+ # '\b[A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2}\b'
269
+ #
270
+
271
+ #
272
+ # Programming
273
+ #
274
+ # //Programming: GUID
275
+ # //Microsoft-style GUID, numbers only.
276
+ # '[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}'
277
+ #
278
+ # //Programming: GUID
279
+ # //Microsoft-style GUID, with optional parentheses or braces.
280
+ # //(Long version, if your regex flavor doesn't support conditionals.)
281
+ # '[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}|\([A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}\)|\{[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}\}'
282
+ #
283
+ # //Programming: GUID
284
+ # //Microsoft-style GUID, with optional parentheses or braces.
285
+ # //Short version, illustrating the use of regex conditionals. Not all regex flavors support conditionals.
286
+ # //Also, when applied to large chunks of data, the regex using conditionals will likely be slower
287
+ # //than the long version. Straight alternation is much easier to optimize for a regex engine.
288
+ # '(?:(\()|(\{))?[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}(?(1)\))(?(2)\})'
289
+ #
290
+ # //Programming: Remove escapes
291
+ # //Remove backslashes used to escape other characters
292
+ # preg_replace('\\(.)', '\1', $text);
293
+ #
294
+ # //Programming: String
295
+ # //Quotes may appear in the string when escaped with a backslash.
296
+ # //The string may span multiple lines.
297
+ # '"[^"\\]*(?:\\.[^"\\]*)*"'
298
+
299
+ #
300
+ # Escape
301
+ #
302
+ # //Regex: Escape metacharacters
303
+ # //Place a backslash in front of the regular expression metacharacters
304
+ # gsub("[][{}()*+?.\\^$|]", "\\$0", $text);
305
+
306
+
307
+
308
+ # 3530588 3.4G /workspace/data lab13
309
+ # 2242028 2.2G /workspace/data lab17
310
+ # 3530588 3.4G /workspace/data lab16
311
+ # 3530588 3.4G /workspace/data lab21
312
+ # 3530588 3.4G /workspace/data lab14
313
+ # 4 4.0K /workspace/data lab12
314
+ # 3530588 3.4G /workspace/data lab15
315
+ # 20 20K /workspace/data lab23
316
+
317
+
318
+
319
+ # Security
320
+ #
321
+ #
322
+ # //Security: ASCII code characters excl. tab and CRLF
323
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
324
+ # //Excludes tabs and line breaks.
325
+ # '[\x00\x08\x0B\x0C\x0E-\x1F]'
326
+ #
327
+ # //Security: ASCII code characters incl. tab and CRLF
328
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
329
+ # //Includes tabs and line breaks.
330
+ # '[\x00-\x1F]'
331
+ #
332
+ # //Security: Escape quotes and backslashes
333
+ # //E.g. escape user input before inserting it into a SQL statement
334
+ # gsub("\\$0", "\\$0", $text);
335
+ #
336
+ # //Security: Unicode code and unassigned characters excl. tab and CRLF
337
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
338
+ # //Also matches any Unicode code point that is unused in the current Unicode standard,
339
+ # //and thus should not occur in text as it cannot be displayed.
340
+ # //Excludes tabs and line breaks.
341
+ # '[^\P{C}\t\r\n]'
342
+ #
343
+ # //Security: Unicode code and unassigned characters incl. tab and CRLF
344
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
345
+ # //Also matches any Unicode code point that is unused in the current Unicode standard,
346
+ # //and thus should not occur in text as it cannot be displayed.
347
+ # //Includes tabs and line breaks.
348
+ # '\p{C}'
349
+ #
350
+ # //Security: Unicode code characters excl. tab and CRLF
351
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
352
+ # //Excludes tabs and line breaks.
353
+ # '[^\P{Cc}\t\r\n]'
354
+ #
355
+ # //Security: Unicode code characters incl. tab and CRLF
356
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
357
+ # //Includes tabs and line breaks.
358
+ # '\p{Cc}'
359
+ #
360
+ #
361
+ #
362
+ # SSN (Social security numbers)
363
+ #
364
+ #
365
+ # //Social security number (US)
366
+ # '\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b'
367
+
368
+
369
+
370
+
@@ -0,0 +1,38 @@
1
module Wuclan::Models

  #
  # Behaviour shared by the text elements (hashtags, urls) lifted out of a tweet.
  #
  module TextElementCommon
    # Key on text-status_id, i.e. two key fields.
    def num_key_fields
      2
    end
  end

  #
  # Topical #hashtags extracted from tweet text.
  #
  # The twitter_user_id is denormalized, but it is often what we want:
  # carrying it along saves a join.
  #
  class Hashtag < TypedStruct.new(
      [:hashtag,         String],
      [:status_id,       Integer],
      [:twitter_user_id, Integer]
      )
    include ModelCommon
    include TextElementCommon

    # Expose the hashtag under the generic name +text+.
    alias_method :text, :hashtag

    # Names of the fields holding numeric ids.
    def numeric_id_fields
      [:twitter_user_id, :status_id]
    end
  end

  #
  # A url extracted from tweet text; same layout as Hashtag, with the url
  # itself in the first field.
  #
  class TweetUrl < TypedStruct.new(
      [:tweet_url,       String],
      [:status_id,       Integer],
      [:twitter_user_id, Integer]
      )
    include ModelCommon
    include TextElementCommon

    # Expose the url under the generic name +text+.
    alias_method :text, :tweet_url

    # Names of the fields holding numeric ids.
    def numeric_id_fields
      [:twitter_user_id, :status_id]
    end
  end
end
@@ -0,0 +1,38 @@
1
+ require 'wuclan/models/tweet/tweet_token'
2
+ require 'wukong/encoding'
3
module Wuclan::Models
  Tweet.class_eval do
    #
    # The string the token extractors should work on, or nil when the tweet
    # is not worth tokenizing.
    #
    def string_for_tokenizing
      # simpleminded test for non-latin script: don't bother if > 20 entities
      return if text.count('&') > 20
      # skip default message from early days
      return if text =~ /just setting up my twttr/
      # Return decoded, whitespace-flattened text.
      #
      # The regexp deliberately carries no /s flag: in Ruby that flag selects
      # an SJIS / Windows-31J encoding (it is not Perl's "dot matches
      # newline"), which pins the pattern to a non-UTF-8 encoding and can make
      # the match blow up on tweets containing non-ASCII characters.
      self.decoded_text.gsub(/\s+/, ' ').strip
    end

    #
    # Extract every token of type +klass+ from +str+ and wrap each one in a
    # +klass+ record carrying this tweet's twitter_user_id and id.
    #
    # NOTE(review): extract_tokens! presumably strips what it matched out of
    # +str+ (hence the bang), and the trailing 1 looks like an initial count --
    # confirm against the token classes.
    #
    def tokens_for klass, str
      klass.extract_tokens!(str).map do |word|
        klass.new(word, twitter_user_id, id, 1)
      end
    end

    #
    # Break this tweet into tokens.
    #
    # Smilies and urls are extracted from the text as written; the text is then
    # downcased in place before retweet, @atsign and #hashtag tokens are
    # extracted. Plain word tokens are only extracted when
    # +extract_word_tokens+ is truthy.
    #
    # Returns an array of token records; empty when there is nothing to
    # tokenize.
    #
    def tokenize extract_word_tokens=nil
      str = string_for_tokenizing
      return [] if str.blank?
      toks = []
      # Case-sensitive tokens
      [ SmilieToken, UrlToken ].each do |klass|
        toks += tokens_for klass, str
      end
      # Case-insensitive tokens
      str.downcase!
      [ RtToken, AtsignToken, HashtagToken ].each do |klass|
        toks += tokens_for klass, str
      end
      toks += tokens_for WordToken, str if extract_word_tokens
      toks
    end

  end
end
@@ -0,0 +1,202 @@
1
+ #!/usr/bin/env ruby
2
module Wuclan
  module Models
    #
    # Regular expressions for picking urls, #hashtags, retweets, @atsigns and
    # smilies out of tweet text.
    #
    module TweetRegexes
      # ===========================================================================
      #
      # Twitter accepts URLs somewhat idiosyncratically, probably for good reason --
      # we rarely see ()![] in urls; more likely in a status they are punctuation.
      #
      # This is what I've reverse engineered.
      #
      #
      # Notes:
      #
      # * is.gd uses a trailing '-' (to indicate 'preview mode'): clever.
      # * pastoid.com uses a trailing '+', and idek.net a trailing ~ for no reason. annoying.
      #
      # Counterexamples:
      # * http://www.5irecipe.cn/recipe_content/2307/'/
      # * http://www.facebook.com/groups.php?id=1347199977&gv=12#/group.php?gid=18183539495
      #
      RE_DOMAIN_HEAD       = '(?:[a-zA-Z0-9\-]+\.)+'
      RE_DOMAIN_TLD        = '(?:com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum|[a-zA-Z]{2})'
      # RE_URL_SCHEME      = '[a-zA-Z][a-zA-Z0-9\-\+\.]+'
      RE_URL_SCHEME_STRICT = '[a-zA-Z]{3,6}'
      RE_URL_UNRESERVED    = 'a-zA-Z0-9'       + '\-\._~'
      RE_URL_OKCHARS       = RE_URL_UNRESERVED + '\'\+\,\;='  + '/%:@'   # not !$&()* [] \|
      RE_URL_QUERYCHARS    = RE_URL_OKCHARS    + '&='
      RE_URL_HOSTPART      = "#{RE_URL_SCHEME_STRICT}://#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}"
      RE_URL               = %r{(
                  #{RE_URL_HOSTPART}                         # Host
        (?:(?: \/ [#{RE_URL_OKCHARS}]+? )*?                  # path:  / delimited path segments
           (?: \/ [#{RE_URL_OKCHARS}]*[\w\-\+\~] )           #        where the last one ends in a non-punctuation.
           |                                                 #        ... or no path segment
           )\/?                                              #        with an optional trailing slash
        (?: \? [#{RE_URL_QUERYCHARS}]+  )?                   # query: introduced by a ?, with &foo= delimited segments
        (?: \# [#{RE_URL_OKCHARS}]+     )?                   # frag:  introduced by a #
      )}x


      #
      # Technically a scheme can allow the characters '+', '-' and '.' within
      # it. In practice you can not only ignore those characters but all but a
      # few specific schemes.
      #
      # From a collection of ~9M tweeted urls, 99.4% were http://, with only the additional
      #   https, mms, ftp, git, irc, feed, itpc, rtsp, hxxp, gopher, telnet, itms, ssh, webcal, svn
      # seemingly worth finding:
      #
      #   8925742 http
      #   6026 https   1841 ivo   122 mms   85 ftp   61 git   53 irc   45 feed   31 itpc   12 www
      #   12 rtsp   12 hxxp   12 gopher   9 telnet   9 itms   7 ssh   5 webcal   5 sop   4 wiie
      #   3 svn   3 sssp   3 file   2 res   1 xttp   1 xmlrpc   1 ssl   1 smb
      #
      # An hxxp http://en.wikipedia.org/wiki/Hxxp is used to obscure a link, so
      # take of that what you may.
      #
      # The ivo:// scheme is used by virtual astronomical observatories; as its
      # hostnames are given in reverse-dotted notation (uk.org.estar) these URIs
      # are imperfectly recognized. Twitter doesn't accept them at all:
      #   http://twitter.com/eSTAR_Project/status/1113930948
      #
      #


      # ===========================================================================
      #
      # A hash following a non-alphanum_ (or at the start of the line),
      # followed by (any number of alpha, num, -_.+:=) and ending in an alphanum_
      #
      # This is overly generous to those dorky triple tags (geo:lat=69.3), but we'll soldier on somehow.
      #
      RE_HASHTAGS          = %r{(?:^|\W)\#([a-zA-Z0-9\-_\.+:=]+\w)(?:\W|$)}

      # ===========================================================================
      #
      # Retweets and Retweet Whores
      #
      # See ARetweetsB for more info.
      #
      # A retweet
      #   RT @interesting_user Something so witty Dorothy Parker would just give up
      #   Oh yeah and so's your mom (via @sixth_grader)
      #   retweeting @ogre: KEGGER TONITE RT pls
      #     ^^^ this is not a rtwhore; it matches first as a retweet
      #
      # and rtwhores
      #   retweet please: Hey here's something I'm whoring xxx
      #   KEGGER TONITE RT pls
      #
      # or semantically-incorrect matches such as (actual example):
      #   @somebody lol, love the 'please retweet' ending!
      #
      # Things that don't match:
      #   retweet is silly, @i_think_youre_dumb
      #   misspell the name of my Sony Via
      #
      RE_RETWEET_WORDS     = 'rt|retweet|retweeting'
      RE_RETWEET_ONLY      = %r{(?:#{RE_RETWEET_WORDS})}
      RE_RETWEET_OR_VIA    = %r{(?:#{RE_RETWEET_WORDS}|via|from)}
      RE_PLEASE            = %r{(?:please|plz|pls)}
      # The building blocks are spliced in via .source: interpolating a Regexp
      # object embeds it as (?-mix:...), which would switch the outer /i off
      # for exactly the words that need it ('RT', 'Via', 'PLS', ...).
      RE_RETWEET           = %r{\b#{RE_RETWEET_OR_VIA.source}\W*@(\w+)\b}i
      RE_RTWHORE           = %r{
          \b#{RE_RETWEET_ONLY.source}\W*#{RE_PLEASE.source}\b
        | \b#{RE_PLEASE.source}\W*#{RE_RETWEET_ONLY.source}\b}ix

      # ===========================================================================
      #
      # following either the start of the line, or a non-alphanum_ character
      # the string of following [a-zA-Z0-9_]
      #
      # Note carefully: we _demand_ a preceding character (or start of line):
      # \b would match email@address.com, which we don't want.
      #
      # Making an exception for RT@im_cramped_for_space.
      #
      # All retweets
      #
      # The retweet-word exception must itself start a word (\b), otherwise
      # addresses such as bart@gmail.com or olivia@example.com would sneak in
      # through their trailing 'rt' / 'via'. It is matched case-insensitively
      # so that 'RT@user' works on text that has not been downcased.
      #
      RE_ATSIGNS           = %r{(?:^|\W|\b(?i:#{RE_RETWEET_OR_VIA.source}))@(\w+)\b}



      # ===========================================================================
      #
      # Numbers (unused sketches, kept for reference)
      #

      # RE_NUMBERS = %r{
      #   (?:^|\D)     # non-number
      #   (
      #    |(?:\(\d{3}\)[\ \-]?\d{3}[\ \-]\d{4})
      #    |(?: (?:\d{1,3}\.)(?:\d{3},)*\.?\d+)   # decimal number
      #    |(?: (?:\d{1,3}\.)(?:\d{3}\.)*,?\d+)   # euro-style
      #    \d+
      #   )
      # }x
      #
      # # IP address
      # \b(?:\d{1,3}\.){3}\d{1,3}\b
      # credit card: (lax)
      # \b(?:\d[ -]*){13,16}\b
      # \b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})\b
      #
      # [-+]?[0-9,]*\.?[0-9]*
      # [-+]?[0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?

      # ===========================================================================
      #
      # Smilies !!! ^_^
      #
      RE_SMILIES_EYES      = "\\:8;"
      RE_SMILIES_NOSE      = "\\-=\\*o"
      RE_SMILIES_MOUTH     = "DP@Oo\\(\\)\\[\\]\\|\\{\\}\\/\\\\"
      RE_SMILIES           = %r{
        (?:^|\W)                        # non-smilie character
        ( (?:
            >?
            [#{RE_SMILIES_EYES}]        # eyes
            [#{RE_SMILIES_NOSE}]?       # nose, maybe
            [#{RE_SMILIES_MOUTH}] )     # mouth
         |(?:
            [#{RE_SMILIES_MOUTH}]       # mouth
            [#{RE_SMILIES_NOSE}]?       # nose, maybe
            [#{RE_SMILIES_EYES}]        # eyes
            <? )
         |(?: =[#{RE_SMILIES_MOUTH}])   # =) (=
         |(?: [#{RE_SMILIES_MOUTH}]=)   # =) (=
         |(?: \^[_\-]\^ )               # kawaaaaiiii!
         |(?: :[,\']\( )                # snif
         |(?: <3 )                      # heart
         |(?: \\m/ )                    # rawk
         |(?: x-\( )                    # dead
        )
        (?:\W|$)
      }x
    end
  end
end
179
+
180
+
181
+ # http://mail.google.com/support/bin/answer.py?hl=en&answer=34056
182
+ # http://en.wikipedia.org/wiki/Emoticons
183
+ #
184
+ # :-) :) =] =) Smiling, happy
185
+ # :-( =( :[ :< frowning, Sad
186
+ # ;-) ;) ;] Wink
187
+ # :D =D XD BD Large grin or laugh
188
+ # :P =P XP Tongue out, or after a joke
189
+ # <3 S2 :> Love
190
+ # :O =O Shocked or surprised
191
+ # =I :/ :-\ Bored, annoyed or awkward; concerned.
192
+ # :S =S :? Confused, embarrassed or uneasy
193
+
194
+ # Icon Meaning Icon Meaning Icon Meaning
195
+ # (^_^) smile (^o^) laughing out loud d(^_^)b thumbs up (not ears)
196
+ # (T_T) sad (crying face) (-.-)Zzz sleeping (Z.Z) sleepy person
197
+ # \(^_^)/ cheers, "Hurrah!" (*^^*) shyness (-_-); sweating (as in ashamed), or exasperated.
198
+ # (*3*) "Surprise !." (?_?) "Nonsense, I don't know." (^_~) wink
199
+ # (o.O) shocked/disturbed (<.<) shifty, suspicious v(^_^)v peace
200
+ #
201
+ # [\\dv](^_^)[bv/]
202
+ #