wuclan 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
@@ -0,0 +1,370 @@
1
+
2
+ # http://github.com/Empact/html_test/tree/master
3
+ # http://github.com/michaeledgar/validates_not_profane
4
+ #
5
+ # http://github.com/porras/livevalidation/tree/master
6
+ # Rails plugin which allows automatic integration of your Rails application with Javascript library LiveValidation. This library implements client-side form validation and you can
7
+ #
8
+ # http://github.com/cainlevy/semantic-attributes
9
+ #
10
+ # git://github.com/alexdunae/validates_email_format_of.git
11
+ # Validate e-mail addresses against RFC 2822 and RFC 3696 with this popular Ruby on Rails plugin and gem.
12
+ #
13
+ # http://github.com/freelancing-god/active-matchers/tree/master
14
+ # Helpful rspec matchers for testing validations and associations.
15
+ #
16
+ # http://github.com/redinger/validation_reflection/tree/master
17
+ # refl = Person.reflect_on_validations_for(:name)
18
+ # refl[0].macro
19
+ # => :validates_presence_of
20
+ #
21
+ # http://github.com/augustl/live-validations/tree/master
22
+ # Reads Active Record's validations and makes them available to live client side javascript validation scripts
23
+ #
24
+ # http://github.com/adzap/validates_timeliness/tree/master
25
+ # Date and time validation plugin for Rails 2.x and allows custom date/time formats
26
+
27
+ # http://github.com/matthewrudy/regexpert/tree/master
28
+ # Description: A collection of common Regexps for Ruby. Validation for emails, uk postcode, etc.
29
+ #
30
+
31
+ # http://plugins.jquery.com/project/validate
32
+ #
33
+ #
34
+
35
+ # ===========================================================================
36
+ #
37
+ # # http://github.com/matthewrudy/regexpert/blob/master/lib/regexpert.rb
38
+ #
39
+ # module Format
40
+ # # This is taken from dm-more - http://github.com/sam/dm-more/tree/master/dm-validations/lib/dm-validations/formats/email.rb
41
+ # # RFC2822 (No attribution reference available)
42
+ # #
43
+ # # doctest: email_address
44
+ # # >> "MatthewRudyJacobs@gmail.com" =~ Regexpert::Format::EmailAddress
45
+ # # => 0
46
+ # #
47
+ # # >> "dev@" =~ Regexpert::Format::EmailAddress
48
+ # # => nil
49
+ # #
50
+ # EmailAddress = begin
51
+ # alpha = "a-zA-Z"
52
+ # digit = "0-9"
53
+ # atext = "[#{alpha}#{digit}\!\#\$\%\&\'\*+\/\=\?\^\_\`\{\|\}\~\-]"
54
+ # dot_atom_text = "#{atext}+([.]#{atext}*)*"
55
+ # dot_atom = "#{dot_atom_text}"
56
+ # qtext = '[^\\x0d\\x22\\x5c\\x80-\\xff]'
57
+ # text = "[\\x01-\\x09\\x11\\x12\\x14-\\x7f]"
58
+ # quoted_pair = "(\\x5c#{text})"
59
+ # qcontent = "(?:#{qtext}|#{quoted_pair})"
60
+ # quoted_string = "[\"]#{qcontent}+[\"]"
61
+ # atom = "#{atext}+"
62
+ # word = "(?:#{atom}|#{quoted_string})"
63
+ # obs_local_part = "#{word}([.]#{word})*"
64
+ # local_part = "(?:#{dot_atom}|#{quoted_string}|#{obs_local_part})"
65
+ # no_ws_ctl = "\\x01-\\x08\\x11\\x12\\x14-\\x1f\\x7f"
66
+ # dtext = "[#{no_ws_ctl}\\x21-\\x5a\\x5e-\\x7e]"
67
+ # dcontent = "(?:#{dtext}|#{quoted_pair})"
68
+ # domain_literal = "\\[#{dcontent}+\\]"
69
+ # obs_domain = "#{atom}([.]#{atom})*"
70
+ # domain = "(?:#{dot_atom}|#{domain_literal}|#{obs_domain})"
71
+ # addr_spec = "#{local_part}\@#{domain}"
72
+ # pattern = /^#{addr_spec}$/
73
+ # end
74
+ #
75
+ # # This is taken from dm-more http://github.com/sam/dm-more/tree/master/dm-validations/lib/dm-validations/formats/url.rb
76
+ # # Regex from http://www.igvita.com/2006/09/07/validating-url-in-ruby-on-rails/
77
+ # #
78
+ # # doctest: url # examples from Rails auto_link tests
79
+ # # >> "http://www.rubyonrails.com/contact;new" =~ Regexpert::Format::Url
80
+ # # => 0
81
+ # # >> "http://maps.google.co.uk/maps?f=q&q=the+london+eye&ie=UTF8&ll=51.503373,-0.11939&spn=0.007052,0.012767&z=16&iwloc=A" =~ Regexpert::Format::Url
82
+ # # => 0
83
+ # # >> "http://en.wikipedia.org/wiki/Sprite_(computer_graphics)" =~ Regexpert::Format::Url
84
+ # # => 0
85
+ # # TODO: think of a good example of a bad url
86
+ # Url = begin
87
+ # /(^$)|(^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$)/ix
88
+ # end
89
+ #
90
+ # # This is taken from Django.Contrib.Localflavor.uk
91
+ # # The regular expression used is sourced from the schema for British Standard
92
+ # # BS7666 address types: http://www.govtalk.gov.uk/gdsc/schemas/bs7666-v2-0.xsd
93
+ # #
94
+ # # doctest: ukpostcode
95
+ # # >> "GIR 0AA" =~ Regexpert::Format::UKPostcode # GIR 0AA is a special GIRO postcode
96
+ # # => 0
97
+ # # >> "AL40XB" =~ Regexpert::Format::UKPostcode
98
+ # # => 0
99
+ # # >> "CB4 1TL" =~ Regexpert::Format::UKPostcode
100
+ # # => 0
101
+ # #
102
+ # # >> "AL44 NOP" =~ Regexpert::Format::UKPostcode
103
+ # # => nil
104
+ # # >> "CB4-1TL" =~ Regexpert::Format::UKPostcode
105
+ # # => nil
106
+ # #
107
+ # UKPostcode = begin
108
+ # outcode_pattern = '[A-PR-UWYZ]([0-9]{1,2}|([A-HIK-Y][0-9](|[0-9]|[ABEHMNPRVWXY]))|[0-9][A-HJKSTUW])'
109
+ # incode_pattern = '[0-9][ABD-HJLNP-UW-Z]{2}'
110
+ # postcode_regex = Regexp.new("^(GIR *0AA|#{outcode_pattern} *#{incode_pattern})$", Regexp::IGNORECASE)
111
+ # end
112
+
113
+
114
+ # ===========================================================================
115
+ #
116
+ # http://www.botvector.net/2008/05/regular-expression-samples.html
117
+ #
118
+ #
119
+ # //Address: State code (US)
120
+ # '/\\b(?:A[KLRZ]|C[AOT]|D[CE]|FL|GA|HI|I[ADLN]|K[SY]|LA|M[ADEINOST]|N[CDEHJMVY]|O[HKR]|PA|RI|S[CD]|T[NX]|UT|V[AT]|W[AIVY])\\b/'
121
+ #
122
+ # //Address: ZIP code (US)
123
+ # '\b[0-9]{5}(?:-[0-9]{4})?\b'
124
+ #
125
+ # //Credit card: All major cards
126
+ # '^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6011[0-9]{12}|3(?:0[0-5]|[68][0-9])[0-9]{11}|3[47][0-9]{13})$'
127
+ #
128
+ # //Credit card: American Express
129
+ # '^3[47][0-9]{13}$'
130
+ #
131
+ # //Credit card: Diners Club
132
+ # '^3(?:0[0-5]|[68][0-9])[0-9]{11}$'
133
+ #
134
+ # //Credit card: Discover
135
+ # '^6011[0-9]{12}$'
136
+ #
137
+ # //Credit card: MasterCard
138
+ # '^5[1-5][0-9]{14}$'
139
+ #
140
+ # //Credit card: Visa
141
+ # '^4[0-9]{12}(?:[0-9]{3})?$'
142
+ #
143
+ # //Credit card: remove non-digits
144
+ # '/[^0-9]+/'
145
+ #
146
+ # //Date d/m/yy and dd/mm/yyyy
147
+ # //1/1/00 through 31/12/99 and 01/01/1900 through 31/12/2099
148
+ # //Matches invalid dates such as February 31st
149
+ # '\b(0?[1-9]|[12][0-9]|3[01])[- /.](0?[1-9]|1[012])[- /.](19|20)?[0-9]{2}\b'
150
+ #
151
+ # //Date dd/mm/yyyy
152
+ # //01/01/1900 through 31/12/2099
153
+ # //Matches invalid dates such as February 31st
154
+ # '(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)[0-9]{2}'
155
+ #
156
+ # //Date m/d/y and mm/dd/yyyy
157
+ # //1/1/99 through 12/31/99 and 01/01/1900 through 12/31/2099
158
+ # //Matches invalid dates such as February 31st
159
+ # //Accepts dashes, spaces, forward slashes and dots as date separators
160
+ # '\b(0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])[- /.](19|20)?[0-9]{2}\b'
161
+ #
162
+ # //Date mm/dd/yyyy
163
+ # //01/01/1900 through 12/31/2099
164
+ # //Matches invalid dates such as February 31st
165
+ # '(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)[0-9]{2}'
166
+ #
167
+ # //Date yy-m-d or yyyy-mm-dd
168
+ # //00-1-1 through 99-12-31 and 1900-01-01 through 2099-12-31
169
+ # //Matches invalid dates such as February 31st
170
+ # '\b(19|20)?[0-9]{2}[- /.](0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])\b'
171
+ #
172
+ # //Date yyyy-mm-dd
173
+ # //1900-01-01 through 2099-12-31
174
+ # //Matches invalid dates such as February 31st
175
+ # '(19|20)[0-9]{2}[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
176
+ #
177
+ #
178
+ # //IP address
179
+ # //Matches 0.0.0.0 through 999.999.999.999
180
+ # //Use this fast and simple regex if you know the data does not contain invalid IP addresses.
181
+ # '\b([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\b'
182
+ #
183
+ # //IP address
184
+ # //Matches 0.0.0.0 through 999.999.999.999
185
+ # //Use this fast and simple regex if you know the data does not contain invalid IP addresses,
186
+ # //and you don't need access to the individual IP numbers.
187
+ # '\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
188
+ #
189
+ # //IP address
190
+ # //Matches 0.0.0.0 through 255.255.255.255
191
+ # //Use this regex to match IP numbers with accuracy, without access to the individual IP numbers.
192
+ # '\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
193
+ #
194
+ # //IP address
195
+ # //Matches 0.0.0.0 through 255.255.255.255
196
+ # //Use this regex to match IP numbers with accuracy.
197
+ # //Each of the 4 numbers is stored into a capturing group, so you can access them for further processing.
198
+ # '\b(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
199
+ #
200
+ #
201
+ # //Number: Currency amount
202
+ # //Optional thousands separators; optional two-digit fraction
203
+ # '\b[0-9]{1,3}(?:,?[0-9]{3})*(?:\.[0-9]{2})?\b'
204
+ #
205
+ # //Number: Currency amount
206
+ # //Optional thousands separators; mandatory two-digit fraction
207
+ # '\b[0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2}\b'
208
+ #
209
+ # //Number: floating point
210
+ # //Matches an integer or a floating point number with mandatory integer part. The sign is optional.
211
+ # '[-+]?\b[0-9]+(\.[0-9]+)?\b'
212
+ #
213
+ # //Number: floating point
214
+ # //Matches an integer or a floating point number with optional integer part. The sign is optional.
215
+ # '[-+]?\b[0-9]*\.?[0-9]+\b'
216
+ #
217
+ # //Number: hexadecimal (C-style)
218
+ # '\b0[xX][0-9a-fA-F]+\b'
219
+ #
220
+ # //Number: Insert thousands separators
221
+ # //Replaces 123456789.00 with 123,456,789.00
222
+ # '(?<=[0-9])(?=(?:[0-9]{3})+(?![0-9]))' //Number: integer //Will match 123 and 456 as separate integer numbers in 123.456 '\b\d+\b' //Number: integer //Does not match numbers like 123.456 '(?
223
+ #
224
+ # Passwords
225
+ #
226
+ #
227
+ # //Password complexity
228
+ # //Tests if the input consists of 6 or more letters, digits, underscores and hyphens.
229
+ # //The input must contain at least one upper case letter, one lower case letter and one digit.
230
+ # '\A(?=[-_a-zA-Z0-9]*?[A-Z])(?=[-_a-zA-Z0-9]*?[a-z])(?=[-_a-zA-Z0-9]*?[0-9])[-_a-zA-Z0-9]{6,}\z'
231
+ #
232
+ # //Password complexity
233
+ # //Tests if the input consists of 6 or more characters.
234
+ # //The input must contain at least one upper case letter, one lower case letter and one digit.
235
+ # '\A(?=[-_a-zA-Z0-9]*?[A-Z])(?=[-_a-zA-Z0-9]*?[a-z])(?=[-_a-zA-Z0-9]*?[0-9])\S{6,}\z'
236
+ #
237
+ # //Path: Windows
238
+ # '\b[a-z]:\\[^/:*?"<>|\r\n]*'
239
+ #
240
+ # //Path: Windows
241
+ # //Different elements of the path are captured into backreferences.
242
+ # '\b((?#drive)[a-z]):\\((?#folder)[^/:*?"<>|\r\n]*\\)?((?#file)[^\\/:*?"<>|\r\n]*)'
243
+ #
244
+ # //Path: Windows or UNC
245
+ # '(?:(?#drive)\b[a-z]:|\\\\[a-z0-9]+)\\[^/:*?"<>|\r\n]*'
246
+ #
247
+ # //Path: Windows or UNC
248
+ # //Different elements of the path are captured into backreferences.
249
+ # '((?#drive)\b[a-z]:|\\\\[a-z0-9]+)\\((?#folder)[^/:*?"<>|\r\n]*\\)?((?#file)[^\\/:*?"<>|\r\n]*)'
250
+
251
+ # //Phone Number (North America)
252
+ # //Matches 3334445555, 333.444.5555, 333-444-5555, 333 444 5555, (333) 444 5555 and all combinations thereof.
253
+ # //Replaces all those with (333) 444-5555
254
+ # preg_replace('\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})', '(\1) \2-\3', $text);
255
+ #
256
+ # //Phone Number (North America)
257
+ # //Matches 3334445555, 333.444.5555, 333-444-5555, 333 444 5555, (333) 444 5555 and all combinations thereof.
258
+ # '\(?[0-9]{3}\)?[-. ]?[0-9]{3}[-. ]?[0-9]{4}'
259
+
260
+
261
+ # Postal codes
262
+ #
263
+ #
264
+ # //Postal code (Canada)
265
+ # '\b[ABCEGHJKLMNPRSTVXY][0-9][A-Z] [0-9][A-Z][0-9]\b'
266
+ #
267
+ # //Postal code (UK)
268
+ # '\b[A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2}\b'
269
+ #
270
+
271
+ #
272
+ # Programming
273
+ #
274
+ # //Programming: GUID
275
+ # //Microsoft-style GUID, numbers only.
276
+ # '[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}'
277
+ #
278
+ # //Programming: GUID
279
+ # //Microsoft-style GUID, with optional parentheses or braces.
280
+ # //(Long version, if your regex flavor doesn't support conditionals.)
281
+ # '[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}|\([A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}\)|\{[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}\}'
282
+ #
283
+ # //Programming: GUID
284
+ # //Microsoft-style GUID, with optional parentheses or braces.
285
+ # //Short version, illustrating the use of regex conditionals. Not all regex flavors support conditionals.
286
+ # //Also, when applied to large chunks of data, the regex using conditionals will likely be slower
287
+ # //than the long version. Straight alternation is much easier to optimize for a regex engine.
288
+ # '(?:(\()|(\{))?[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}(?(1)\))(?(2)\})'
289
+ #
290
+ # //Programming: Remove escapes
291
+ # //Remove backslashes used to escape other characters
292
+ # preg_replace('\\(.)', '\1', $text);
293
+ #
294
+ # //Programming: String
295
+ # //Quotes may appear in the string when escaped with a backslash.
296
+ # //The string may span multiple lines.
297
+ # '"[^"\\]*(?:\\.[^"\\]*)*"'
298
+
299
+ #
300
+ # Escape
301
+ #
302
+ # //Regex: Escape metacharacters
303
+ # //Place a backslash in front of the regular expression metacharacters
304
+ # gsub("[][{}()*+?.\\^$|]", "\\$0", $text);
305
+
306
+
307
+
308
+ # 3530588 3.4G /workspace/data lab13
309
+ # 2242028 2.2G /workspace/data lab17
310
+ # 3530588 3.4G /workspace/data lab16
311
+ # 3530588 3.4G /workspace/data lab21
312
+ # 3530588 3.4G /workspace/data lab14
313
+ # 4 4.0K /workspace/data lab12
314
+ # 3530588 3.4G /workspace/data lab15
315
+ # 20 20K /workspace/data lab23
316
+
317
+
318
+
319
+ # Security
320
+ #
321
+ #
322
+ # //Security: ASCII code characters excl. tab and CRLF
323
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
324
+ # //Excludes tabs and line breaks.
325
+ # '[\x00\x08\x0B\x0C\x0E-\x1F]'
326
+ #
327
+ # //Security: ASCII code characters incl. tab and CRLF
328
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
329
+ # //Includes tabs and line breaks.
330
+ # '[\x00-\x1F]'
331
+ #
332
+ # //Security: Escape quotes and backslashes
333
+ # //E.g. escape user input before inserting it into a SQL statement
334
+ # gsub("\\$0", "\\$0", $text);
335
+ #
336
+ # //Security: Unicode code and unassigned characters excl. tab and CRLF
337
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
338
+ # //Also matches any Unicode code point that is unused in the current Unicode standard,
339
+ # //and thus should not occur in text as it cannot be displayed.
340
+ # //Excludes tabs and line breaks.
341
+ # '[^\P{C}\t\r\n]'
342
+ #
343
+ # //Security: Unicode code and unassigned characters incl. tab and CRLF
344
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
345
+ # //Also matches any Unicode code point that is unused in the current Unicode standard,
346
+ # //and thus should not occur in text as it cannot be displayed.
347
+ # //Includes tabs and line breaks.
348
+ # '\p{C}'
349
+ #
350
+ # //Security: Unicode code characters excl. tab and CRLF
351
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
352
+ # //Excludes tabs and line breaks.
353
+ # '[^\P{Cc}\t\r\n]'
354
+ #
355
+ # //Security: Unicode code characters incl. tab and CRLF
356
+ # //Matches any single non-printable code character that may cause trouble in certain situations.
357
+ # //Includes tabs and line breaks.
358
+ # '\p{Cc}'
359
+ #
360
+ #
361
+ #
362
+ # SSN (Social security numbers)
363
+ #
364
+ #
365
+ # //Social security number (US)
366
+ # '\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b'
367
+
368
+
369
+
370
+
@@ -0,0 +1,38 @@
1
module Wuclan::Models

  #
  # Behaviour shared by the text-element models defined below.
  #
  module TextElementCommon
    # Key on text-status_id (two key fields).
    def num_key_fields
      2
    end
  end

  #
  # Topical #hashtags extracted from tweet text.
  #
  # The twitter_user_id is denormalized,
  # but it is often what we want: it saves a join.
  #
  class Hashtag < TypedStruct.new(
      [:hashtag,         String],
      [:status_id,       Integer],
      [:twitter_user_id, Integer]
      )
    include ModelCommon
    include TextElementCommon

    # Uniform accessor: #text returns the hashtag field.
    alias_method :text, :hashtag

    # Names of the fields that hold numeric ids.
    def numeric_id_fields
      [:twitter_user_id, :status_id]
    end
  end

  #
  # A URL paired with the status and user ids of a tweet.
  # Same layout as Hashtag, with the tweet_url field in place of hashtag.
  #
  class TweetUrl < TypedStruct.new(
      [:tweet_url,       String],
      [:status_id,       Integer],
      [:twitter_user_id, Integer]
      )
    include ModelCommon
    include TextElementCommon

    # Uniform accessor: #text returns the tweet_url field.
    alias_method :text, :tweet_url

    # Names of the fields that hold numeric ids.
    def numeric_id_fields
      [:twitter_user_id, :status_id]
    end
  end
end
@@ -0,0 +1,38 @@
1
+ require 'wuclan/models/tweet/tweet_token'
2
+ require 'wukong/encoding'
3
module Wuclan::Models
  Tweet.class_eval do
    #
    # Text of the tweet prepared for token extraction.
    #
    # Returns nil (nothing to tokenize) when the text has more than 20 '&'
    # characters or contains the early-days default message; otherwise
    # returns decoded_text with each whitespace run collapsed to a single
    # space and the ends stripped.
    #
    def string_for_tokenizing
      # simpleminded test for non-latin script: don't bother if > 20 entities
      return if text.count('&') > 20
      # skip default message from early days
      return if text =~ /just setting up my twttr/
      # Return decoded, whitespace-flattened text.
      #
      # No regexp option here on purpose: in Ruby the 's' option is not
      # "dot-all" but pins the pattern to a Windows-31J (SJIS) encoding,
      # which raises Encoding::CompatibilityError against UTF-8 text that
      # contains non-ASCII characters.
      decoded_text.gsub(/\s+/, ' ').strip
    end

    #
    # Builds one +klass+ instance per token that +klass+ extracts from +str+.
    #
    # klass:: a token class responding to .extract_tokens! and .new
    # str::   the string handed to klass.extract_tokens!
    #         NOTE(review): the bang suggests str is modified in place --
    #         confirm against the token classes.
    #
    # Each token is constructed as (word, twitter_user_id, id, 1); the
    # trailing 1 is presumably an initial count -- TODO confirm.
    #
    def tokens_for klass, str
      klass.extract_tokens!(str).map do |word|
        klass.new(word, twitter_user_id, id, 1)
      end
    end

    #
    # Extracts tokens from this tweet and returns them as an array.
    #
    # extract_word_tokens:: when truthy, WordToken tokens are extracted too
    #                       (default nil: skip them).
    #
    # Returns [] when string_for_tokenizing gives nothing usable.
    #
    def tokenize extract_word_tokens=nil
      str = string_for_tokenizing
      return [] if str.blank?
      toks = []
      # Case-sensitive tokens: extracted before the string is downcased
      [ SmilieToken, UrlToken ].each do |klass|
        toks += tokens_for(klass, str)
      end
      # Case-insensitive tokens: downcase in place, then extract
      str.downcase!
      [ RtToken, AtsignToken, HashtagToken ].each do |klass|
        toks += tokens_for(klass, str)
      end
      toks += tokens_for(WordToken, str) if extract_word_tokens
      toks
    end

  end
end
@@ -0,0 +1,202 @@
1
+ #!/usr/bin/env ruby
2
module Wuclan
  module Models
    #
    # Regular expressions for picking URLs, hashtags, retweets, @-mentions
    # and smilies out of tweet text.
    #
    module TweetRegexes
      # ===========================================================================
      #
      # Twitter accepts URLs somewhat idiosyncratically, probably for good reason --
      # we rarely see ()![] in urls; more likely in a status they are punctuation.
      #
      # This is what I've reverse engineered.
      #
      #
      # Notes:
      #
      # * is.gd uses a trailing '-' (to indicate 'preview mode'): clever.
      # * pastoid.com uses a trailing '+', and idek.net a trailing ~ for no reason. annoying.
      #
      # Counterexamples:
      # * http://www.5irecipe.cn/recipe_content/2307/'/
      # * http://www.facebook.com/groups.php?id=1347199977&gv=12#/group.php?gid=18183539495
      #
      RE_DOMAIN_HEAD       = '(?:[a-zA-Z0-9\-]+\.)+'
      RE_DOMAIN_TLD        = '(?:com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum|[a-zA-Z]{2})'
      # RE_URL_SCHEME      = '[a-zA-Z][a-zA-Z0-9\-\+\.]+'
      RE_URL_SCHEME_STRICT = '[a-zA-Z]{3,6}'
      RE_URL_UNRESERVED    = 'a-zA-Z0-9' + '\-\._~'
      RE_URL_OKCHARS       = RE_URL_UNRESERVED + '\'\+\,\;=' + '/%:@' # not !$&()* [] \|
      RE_URL_QUERYCHARS    = RE_URL_OKCHARS + '&='
      RE_URL_HOSTPART      = "#{RE_URL_SCHEME_STRICT}://#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}"
      RE_URL               = %r{(
        #{RE_URL_HOSTPART}                            # Host
        (?:(?: \/ [#{RE_URL_OKCHARS}]+? )*?           # path: / delimited path segments
        (?: \/ [#{RE_URL_OKCHARS}]*[\w\-\+\~] )       # where the last one ends in a non-punctuation.
        |                                             # ... or no path segment
        )\/?                                          # with an optional trailing slash
        (?: \? [#{RE_URL_QUERYCHARS}]+ )?             # query: introduced by a ?, with &foo= delimited segments
        (?: \# [#{RE_URL_OKCHARS}]+ )?                # frag: introduced by a #
        )}x


      #
      # Technically a scheme can allow the characters '+', '-' and '.' within
      # it. In practice you can not only ignore those characters but all but a
      # few specific schemes.
      #
      # From a collection of ~9M tweeted urls, 99.4% were http://, with only the additional
      # https, mms, ftp, git, irc, feed, itpc, rtsp, hxxp, gopher, telnet, itms, ssh, webcal, svn
      # seemingly worth finding:
      #
      # 8925742 http
      # 6026 https 1841 ivo 122 mms 85 ftp 61 git 53 irc 45 feed 31 itpc 12 www
      # 12 rtsp 12 hxxp 12 gopher 9 telnet 9 itms 7 ssh 5 webcal 5 sop 4 wiie
      # 3 svn 3 sssp 3 file 2 res 1 xttp 1 xmlrpc 1 ssl 1 smb
      #
      # An hxxp http://en.wikipedia.org/wiki/Hxxp is used to obscure a link, so
      # make of that what you will.
      #
      # The ivo:// scheme is used by virtual astronomical observatories; as its
      # hostnames are given in reverse-dotted notation (uk.org.estar) these URIs
      # are imperfectly recognized. Twitter doesn't accept them at all:
      # http://twitter.com/eSTAR_Project/status/1113930948
      #
      #


      # ===========================================================================
      #
      # A hash following a non-alphanum_ (or at the start of the line),
      # followed by (any number of alpha, num, -_.+:=) and ending in an alphanum_
      #
      # This is overly generous to those dorky triple tags (geo:lat=69.3), but we'll soldier on somehow.
      #
      RE_HASHTAGS = %r{(?:^|\W)\#([a-zA-Z0-9\-_\.+:=]+\w)(?:\W|$)}

      # ===========================================================================
      #
      # Retweets and Retweet Whores
      #
      # See ARetweetsB for more info.
      #
      # A retweet
      #   RT @interesting_user Something so witty Dorothy Parker would just give up
      #   Oh yeah and so's your mom (via @sixth_grader)
      #   retweeting @ogre: KEGGER TONITE RT pls
      #   ^^^ this is not a rtwhore; it matches first as a retweet
      #
      # and rtwhores
      #   retweet please: Hey here's something I'm whoring xxx
      #   KEGGER TONITE RT pls
      #
      # or semantically-incorrect matches such as (actual example):
      #   @somebody lol, love the 'please retweet' ending!
      #
      # Things that don't match:
      #   retweet is silly, @i_think_youre_dumb
      #   misspell the name of my Sony Via
      #
      RE_RETWEET_WORDS  = 'rt|retweet|retweeting'
      RE_RETWEET_ONLY   = %r{(?:#{RE_RETWEET_WORDS})}
      RE_RETWEET_OR_VIA = %r{(?:#{RE_RETWEET_WORDS}|via|from)}
      RE_PLEASE         = %r{(?:please|plz|pls)}
      # The sub-patterns are spliced in via #source rather than directly:
      # interpolating a Regexp object embeds it as (?-mix:...), which switches
      # the outer /i flag off for that part, so "RT @user" would not match.
      RE_RETWEET        = %r{\b#{RE_RETWEET_OR_VIA.source}\W*@(\w+)\b}i
      RE_RTWHORE        = %r{
          \b#{RE_RETWEET_ONLY.source}\W*#{RE_PLEASE.source}\b
        | \b#{RE_PLEASE.source}\W*#{RE_RETWEET_ONLY.source}\b}ix

      # ===========================================================================
      #
      # following either the start of the line, or a non-alphanum_ character
      # the string of following [a-zA-Z0-9_]
      #
      # Note carefully: we _demand_ a preceding character (or start of line):
      # \b would match email@address.com, which we don't want.
      #
      # Making an exception for RT@im_cramped_for_space.
      #
      # All retweets
      #
      # NOTE(review): there is no /i flag here, so the rt/via/from exception
      # only fires on lower-case text -- confirm callers downcase first.
      #
      RE_ATSIGNS = %r{(?:^|\W|#{RE_RETWEET_OR_VIA})@(\w+)\b}



      # ===========================================================================
      #
      # Numbers, IP addresses, credit cards (commented-out sketches, not in use)
      #

      # RE_NUMBERS = %r{
      #   (?:^|\D) # non-number
      #   (
      #   |(?:\(\d{3}\)[\ \-]?\d{3}[\ \-]\d{4})
      #   |(?: (?:\d{1,3}\.)(?:\d{3},)*\.?\d+) # decimal number
      #   |(?: (?:\d{1,3}\.)(?:\d{3}\.)*,?\d+) # euro-style
      #   \d+
      #   )
      # }x
      #
      # # IP address
      # \b(?:\d{1,3}\.){3}\d{1,3}\b
      # credit card: (lax)
      # \b(?:\d[ -]*){13,16}\b
      # \b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})\b
      #
      # [-+]?[0-9,]*\.?[0-9]*
      # [-+]?[0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?

      # ===========================================================================
      #
      # Smilies !!! ^_^
      #
      RE_SMILIES_EYES  = "\\:8;"
      RE_SMILIES_NOSE  = "\\-=\\*o"
      RE_SMILIES_MOUTH = "DP@Oo\\(\\)\\[\\]\\|\\{\\}\\/\\\\"
      RE_SMILIES = %r{
        (?:^|\W)                        # non-smilie character
        ( (?:
            >?
            [#{RE_SMILIES_EYES}]        # eyes
            [#{RE_SMILIES_NOSE}]?       # nose, maybe
            [#{RE_SMILIES_MOUTH}] )     # mouth
        |(?:
            [#{RE_SMILIES_MOUTH}]       # mouth
            [#{RE_SMILIES_NOSE}]?       # nose, maybe
            [#{RE_SMILIES_EYES}]        # eyes
            <? )
        |(?: =[#{RE_SMILIES_MOUTH}])    # =) (=
        |(?: [#{RE_SMILIES_MOUTH}]=)    # =) (=
        |(?: \^[_\-]\^ )                # kawaaaaiiii!
        |(?: :[,\']\( )                 # snif
        |(?: <3 )                       # heart
        |(?: \\m/ )                     # rawk
        |(?: x-\( )                     # dead
        )
        (?:\W|$)
      }x
    end
  end
end
179
+
180
+
181
+ # http://mail.google.com/support/bin/answer.py?hl=en&answer=34056
182
+ # http://en.wikipedia.org/wiki/Emoticons
183
+ #
184
+ # :-) :) =] =) Smiling, happy
185
+ # :-( =( :[ :< frowning, Sad
186
+ # ;-) ;) ;] Wink
187
+ # :D =D XD BD Large grin or laugh
188
+ # :P =P XP Tongue out, or after a joke
189
+ # <3 S2 :> Love
190
+ # :O =O Shocked or surprised
191
+ # =I :/ :-\ Bored, annoyed or awkward; concerned.
192
+ # :S =S :? Confused, embarrassed or uneasy
193
+
194
+ # Icon Meaning Icon Meaning Icon Meaning
195
+ # (^_^) smile (^o^) laughing out loud d(^_^)b thumbs up (not ears)
196
+ # (T_T) sad (crying face) (-.-)Zzz sleeping (Z.Z) sleepy person
197
+ # \(^_^)/ cheers, "Hurrah!" (*^^*) shyness (-_-); sweating (as in ashamed), or exasperated.
198
+ # (*3*) "Surprise !." (?_?) "Nonsense, I don't know." (^_~) wink
199
+ # (o.O) shocked/disturbed (<.<) shifty, suspicious v(^_^)v peace
200
+ #
201
+ # [\\dv](^_^)[bv/]
202
+ #