RubyGems - wuclan - Versions diffs - 0.2.0 - Mend

wuclan 0.2.0

Files changed (111) hide show

data/LICENSE.textile +20 -0
data/README.textile +28 -0
data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
data/examples/analyze/strong_links/main.rb +51 -0
data/examples/analyze/word_count/dump_schema.rb +13 -0
data/examples/analyze/word_count/freq_user.rb +31 -0
data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
data/examples/analyze/word_count/word_count.pig +43 -0
data/examples/analyze/word_count/word_count.rb +34 -0
data/examples/lastfm/scrape/load_lastfm.rb +31 -0
data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
data/examples/lastfm/scrape/seed.tsv +147 -0
data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
data/examples/twitter/old/scrape_twitter_api.rb +104 -0
data/examples/twitter/old/scrape_twitter_search.rb +57 -0
data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
data/lib/old/twitter_api.rb +88 -0
data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
data/lib/wuclan/delicious/delicious_models.rb +26 -0
data/lib/wuclan/delicious/delicious_request.rb +65 -0
data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
data/lib/wuclan/friendster.rb +7 -0
data/lib/wuclan/lastfm/model/base.rb +49 -0
data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
data/lib/wuclan/lastfm/scrape/base.rb +195 -0
data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
data/lib/wuclan/lastfm/scrape.rb +12 -0
data/lib/wuclan/lastfm.rb +7 -0
data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
data/lib/wuclan/metrics/user_metrics.rb +443 -0
data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
data/lib/wuclan/metrics.rb +0 -0
data/lib/wuclan/myspace.rb +21 -0
data/lib/wuclan/open_social/model/base.rb +0 -0
data/lib/wuclan/open_social/scrape/base.rb +111 -0
data/lib/wuclan/open_social/scrape_request.rb +6 -0
data/lib/wuclan/open_social.rb +0 -0
data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
data/lib/wuclan/twitter/api_response_examples.textile +300 -0
data/lib/wuclan/twitter/model/base.rb +72 -0
data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
data/lib/wuclan/twitter/model/relationship.rb +176 -0
data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
data/lib/wuclan/twitter/model/text_element.rb +38 -0
data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
data/lib/wuclan/twitter/model/tweet.rb +74 -0
data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
data/lib/wuclan/twitter/model.rb +21 -0
data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
data/lib/wuclan/twitter/scrape/base.rb +97 -0
data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
data/lib/wuclan/twitter/scrape.rb +27 -0
data/lib/wuclan/twitter.rb +7 -0
data/lib/wuclan.rb +1 -0
data/spec/spec_helper.rb +9 -0
data/spec/wuclan_spec.rb +7 -0
data/wuclan.gemspec +184 -0
metadata +219 -0

data/lib/wuclan/twitter/model/text_element/more_regexes.rb ADDED Viewed

@@ -0,0 +1,370 @@
+# http://github.com/Empact/html_test/tree/master
+# http://github.com/michaeledgar/validates_not_profane
+#
+# http://github.com/porras/livevalidation/tree/master
+#   Rails plugin which allows automatic integration of your Rails application with Javascript library LiveValidation. This library implements client-side form validation and you can
+#
+# http://github.com/cainlevy/semantic-attributes
+#
+# git://github.com/alexdunae/validates_email_format_of.git
+#   Validate e-mail addresses against RFC 2822 and RFC 3696 with this popular Ruby on Rails plugin and gem.
+#
+# http://github.com/freelancing-god/active-matchers/tree/master
+#   Helpful rspec matchers for testing validations and associations.
+#
+# http://github.com/redinger/validation_reflection/tree/master
+#  refl = Person.reflect_on_validations_for(:name)
+#  refl[0].macro
+#  => :validates_presence_of
+#
+# http://github.com/augustl/live-validations/tree/master
+#   Reads Active Record's validations and makes them available to live client side javascript validation scripts
+#
+# http://github.com/adzap/validates_timeliness/tree/master
+#   Date and time validation plugin for Rails 2.x and allows custom date/time formats
+# http://github.com/matthewrudy/regexpert/tree/master
+#   Description:        A collection of common Regexps for Ruby. Validation for emails, uk postcode, etc.
+#
+# http://plugins.jquery.com/project/validate
+#
+#
+# ===========================================================================
+#
+# # http://github.com/matthewrudy/regexpert/blob/master/lib/regexpert.rb
+#
+#   module Format
+#     # This is taken from dm-more - http://github.com/sam/dm-more/tree/master/dm-validations/lib/dm-validations/formats/email.rb
+#     # RFC2822 (No attribution reference available)
+#     #
+#     # doctest: email_address
+#     # >> "MatthewRudyJacobs@gmail.com" =~ Regexpert::Format::EmailAddress
+#     # => 0
+#     #
+#     # >> "dev@" =~ Regexpert::Format::EmailAddress
+#     # => nil
+#     #
+#     EmailAddress = begin
+#       alpha = "a-zA-Z"
+#       digit = "0-9"
+#       atext = "[#{alpha}#{digit}\!\#\$\%\&\'\*+\/\=\?\^\_\`\{\|\}\~\-]"
+#       dot_atom_text = "#{atext}+([.]#{atext}*)*"
+#       dot_atom = "#{dot_atom_text}"
+#       qtext = '[^\\x0d\\x22\\x5c\\x80-\\xff]'
+#       text = "[\\x01-\\x09\\x11\\x12\\x14-\\x7f]"
+#       quoted_pair = "(\\x5c#{text})"
+#       qcontent = "(?:#{qtext}|#{quoted_pair})"
+#       quoted_string = "[\"]#{qcontent}+[\"]"
+#       atom = "#{atext}+"
+#       word = "(?:#{atom}|#{quoted_string})"
+#       obs_local_part = "#{word}([.]#{word})*"
+#       local_part = "(?:#{dot_atom}|#{quoted_string}|#{obs_local_part})"
+#       no_ws_ctl = "\\x01-\\x08\\x11\\x12\\x14-\\x1f\\x7f"
+#       dtext = "[#{no_ws_ctl}\\x21-\\x5a\\x5e-\\x7e]"
+#       dcontent = "(?:#{dtext}|#{quoted_pair})"
+#       domain_literal = "\\[#{dcontent}+\\]"
+#       obs_domain = "#{atom}([.]#{atom})*"
+#       domain = "(?:#{dot_atom}|#{domain_literal}|#{obs_domain})"
+#       addr_spec = "#{local_part}\@#{domain}"
+#       pattern = /^#{addr_spec}$/
+#     end
+#
+#     # This is taken from dm-more http://github.com/sam/dm-more/tree/master/dm-validations/lib/dm-validations/formats/url.rb
+#     # Regex from http://www.igvita.com/2006/09/07/validating-url-in-ruby-on-rails/
+#     #
+#     # doctest: url # examples from Rails auto_link tests
+#     # >> "http://www.rubyonrails.com/contact;new" =~ Regexpert::Format::Url
+#     # => 0
+#     # >> "http://maps.google.co.uk/maps?f=q&q=the+london+eye&ie=UTF8&ll=51.503373,-0.11939&spn=0.007052,0.012767&z=16&iwloc=A" =~ Regexpert::Format::Url
+#     # => 0
+#     # >> "http://en.wikipedia.org/wiki/Sprite_(computer_graphics)" =~ Regexpert::Format::Url
+#     # => 0
+#     # TODO: think of a good example of a bad url
+#     Url = begin
+# /(^$)|(^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$)/ix
+#     end
+#
+#     # This is taken from Django.Contrib.Localflavor.uk
+#     # The regular expression used is sourced from the schema for British Standard
+#     # BS7666 address types: http://www.govtalk.gov.uk/gdsc/schemas/bs7666-v2-0.xsd
+#     #
+#     # doctest: ukpostcode
+#     # >> "GIR 0AA" =~ Regexpert::Format::UKPostcode # GIR 0AA is a special GIRO postcode
+#     # => 0
+#     # >> "AL40XB" =~ Regexpert::Format::UKPostcode
+#     # => 0
+#     # >> "CB4 1TL" =~ Regexpert::Format::UKPostcode
+#     # => 0
+#     #
+#     # >> "AL44 NOP" =~ Regexpert::Format::UKPostcode
+#     # => nil
+#     # >> "CB4-1TL" =~ Regexpert::Format::UKPostcode
+#     # => nil
+#     #
+#     UKPostcode = begin
+#       outcode_pattern = '[A-PR-UWYZ]([0-9]{1,2}|([A-HIK-Y][0-9](|[0-9]|[ABEHMNPRVWXY]))|[0-9][A-HJKSTUW])'
+#       incode_pattern = '[0-9][ABD-HJLNP-UW-Z]{2}'
+#       postcode_regex = Regexp.new("^(GIR *0AA|#{outcode_pattern} *#{incode_pattern})$", Regexp::IGNORECASE)
+#     end
+# ===========================================================================
+#
+# http://www.botvector.net/2008/05/regular-expression-samples.html
+#
+#
+# //Address: State code (US)
+# '/\\b(?:A[KLRZ]|C[AOT]|D[CE]|FL|GA|HI|I[ADLN]|K[SY]|LA|M[ADEINOST]|N[CDEHJMVY]|O[HKR]|PA|RI|S[CD]|T[NX]|UT|V[AT]|W[AIVY])\\b/'
+#
+# //Address: ZIP code (US)
+# '\b[0-9]{5}(?:-[0-9]{4})?\b'
+#
+# //Credit card: All major cards
+# '^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6011[0-9]{12}|3(?:0[0-5]|[68][0-9])[0-9]{11}|3[47][0-9]{13})$'
+#
+# //Credit card: American Express
+# '^3[47][0-9]{13}$'
+#
+# //Credit card: Diners Club
+# '^3(?:0[0-5]|[68][0-9])[0-9]{11}$'
+#
+# //Credit card: Discover
+# '^6011[0-9]{12}$'
+#
+# //Credit card: MasterCard
+# '^5[1-5][0-9]{14}$'
+#
+# //Credit card: Visa
+# '^4[0-9]{12}(?:[0-9]{3})?$'
+#
+# //Credit card: remove non-digits
+# '/[^0-9]+/'
+#
+# //Date d/m/yy and dd/mm/yyyy
+# //1/1/00 through 31/12/99 and 01/01/1900 through 31/12/2099
+# //Matches invalid dates such as February 31st
+# '\b(0?[1-9]|[12][0-9]|3[01])[- /.](0?[1-9]|1[012])[- /.](19|20)?[0-9]{2}\b'
+#
+# //Date dd/mm/yyyy
+# //01/01/1900 through 31/12/2099
+# //Matches invalid dates such as February 31st
+# '(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)[0-9]{2}'
+#
+# //Date m/d/y and mm/dd/yyyy
+# //1/1/99 through 12/31/99 and 01/01/1900 through 12/31/2099
+# //Matches invalid dates such as February 31st
+# //Accepts dashes, spaces, forward slashes and dots as date separators
+# '\b(0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])[- /.](19|20)?[0-9]{2}\b'
+#
+# //Date mm/dd/yyyy
+# //01/01/1900 through 12/31/2099
+# //Matches invalid dates such as February 31st
+# '(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)[0-9]{2}'
+#
+# //Date yy-m-d or yyyy-mm-dd
+# //00-1-1 through 99-12-31 and 1900-01-01 through 2099-12-31
+# //Matches invalid dates such as February 31st
+# '\b(19|20)?[0-9]{2}[- /.](0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])\b'
+#
+# //Date yyyy-mm-dd
+# //1900-01-01 through 2099-12-31
+# //Matches invalid dates such as February 31st
+# '(19|20)[0-9]{2}[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
+#
+#
+# //IP address
+# //Matches 0.0.0.0 through 999.999.999.999
+# //Use this fast and simple regex if you know the data does not contain invalid IP addresses.
+# '\b([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\b'
+#
+# //IP address
+# //Matches 0.0.0.0 through 999.999.999.999
+# //Use this fast and simple regex if you know the data does not contain invalid IP addresses,
+# //and you don't need access to the individual IP numbers.
+# '\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
+#
+# //IP address
+# //Matches 0.0.0.0 through 255.255.255.255
+# //Use this regex to match IP numbers with accuracy, without access to the individual IP numbers.
+# '\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
+#
+# //IP address
+# //Matches 0.0.0.0 through 255.255.255.255
+# //Use this regex to match IP numbers with accuracy.
+# //Each of the 4 numbers is stored into a capturing group, so you can access them for further processing.
+# '\b(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b'
+#
+#
+# //Number: Currency amount
+# //Optional thousands separators; optional two-digit fraction
+# '\b[0-9]{1,3}(?:,?[0-9]{3})*(?:\.[0-9]{2})?\b'
+#
+# //Number: Currency amount
+# //Optional thousands separators; mandatory two-digit fraction
+# '\b[0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2}\b'
+#
+# //Number: floating point
+# //Matches an integer or a floating point number with mandatory integer part.  The sign is optional.
+# '[-+]?\b[0-9]+(\.[0-9]+)?\b'
+#
+# //Number: floating point
+# //Matches an integer or a floating point number with optional integer part.  The sign is optional.
+# '[-+]?\b[0-9]*\.?[0-9]+\b'
+#
+# //Number: hexadecimal (C-style)
+# '\b0[xX][0-9a-fA-F]+\b'
+#
+# //Number: Insert thousands separators
+# //Replaces 123456789.00 with 123,456,789.00
+# '(?<=[0-9])(?=(?:[0-9]{3})+(?![0-9]))'
+#
+# //Number: integer
+# //Will match 123 and 456 as separate integer numbers in 123.456
+# '\b\d+\b'
+#
+# //Number: integer
+# //Does not match numbers like 123.456
+# '(?   [pattern truncated in the copied source]
+#
+# Passwords
+#
+#
+# //Password complexity
+# //Tests if the input consists of 6 or more letters, digits, underscores and hyphens.
+# //The input must contain at least one upper case letter, one lower case letter and one digit.
+# '\A(?=[-_a-zA-Z0-9]*?[A-Z])(?=[-_a-zA-Z0-9]*?[a-z])(?=[-_a-zA-Z0-9]*?[0-9])[-_a-zA-Z0-9]{6,}\z'
+#
+# //Password complexity
+# //Tests if the input consists of 6 or more characters.
+# //The input must contain at least one upper case letter, one lower case letter and one digit.
+# '\A(?=[-_a-zA-Z0-9]*?[A-Z])(?=[-_a-zA-Z0-9]*?[a-z])(?=[-_a-zA-Z0-9]*?[0-9])\S{6,}\z'
+#
+# //Path: Windows
+# '\b[a-z]:\\[^/:*?"<>|\r\n]*'
+#
+# //Path: Windows
+# //Different elements of the path are captured into backreferences.
+# '\b((?#drive)[a-z]):\\((?#folder)[^/:*?"<>|\r\n]*\\)?((?#file)[^\\/:*?"<>|\r\n]*)'
+#
+# //Path: Windows or UNC
+# '(?:(?#drive)\b[a-z]:|\\\\[a-z0-9]+)\\[^/:*?"<>|\r\n]*'
+#
+# //Path: Windows or UNC
+# //Different elements of the path are captured into backreferences.
+# '((?#drive)\b[a-z]:|\\\\[a-z0-9]+)\\((?#folder)[^/:*?"<>|\r\n]*\\)?((?#file)[^\\/:*?"<>|\r\n]*)'
+# //Phone Number (North America)
+# //Matches 3334445555, 333.444.5555, 333-444-5555, 333 444 5555, (333) 444 5555 and all combinations thereof.
+# //Replaces all those with (333) 444-5555
+# preg_replace('\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{4})', '(\1) \2-\3', $text);
+#
+# //Phone Number (North America)
+# //Matches 3334445555, 333.444.5555, 333-444-5555, 333 444 5555, (333) 444 5555 and all combinations thereof.
+# '\(?[0-9]{3}\)?[-. ]?[0-9]{3}[-. ]?[0-9]{4}'
+# Postal codes
+#
+#
+# //Postal code (Canada)
+# '\b[ABCEGHJKLMNPRSTVXY][0-9][A-Z] [0-9][A-Z][0-9]\b'
+#
+# //Postal code (UK)
+# '\b[A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2}\b'
+#
+#
+# Programming
+#
+# //Programming: GUID
+# //Microsoft-style GUID, numbers only.
+# '[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}'
+#
+# //Programming: GUID
+# //Microsoft-style GUID, with optional parentheses or braces.
+# //(Long version, if your regex flavor doesn't support conditionals.)
+# '[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}|\([A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}\)|\{[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}\}'
+#
+# //Programming: GUID
+# //Microsoft-style GUID, with optional parentheses or braces.
+# //Short version, illustrating the use of regex conditionals.  Not all regex flavors support conditionals.
+# //Also, when applied to large chunks of data, the regex using conditionals will likely be slower
+# //than the long version.  Straight alternation is much easier to optimize for a regex engine.
+# '(?:(\()|(\{))?[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}(?(1)\))(?(2)\})'
+#
+# //Programming: Remove escapes
+# //Remove backslashes used to escape other characters
+# preg_replace('\\(.)', '\1', $text);
+#
+# //Programming: String
+# //Quotes may appear in the string when escaped with a backslash.
+# //The string may span multiple lines.
+# '"[^"\\]*(?:\\.[^"\\]*)*"'
+#
+# Escape
+#
+# //Regex: Escape metacharacters
+# //Place a backslash in front of the regular expression metacharacters
+# gsub("[][{}()*+?.\\^$|]", "\\$0", $text);
+        # 3530588    3.4G /workspace/data lab13
+        # 2242028    2.2G /workspace/data lab17
+        # 3530588    3.4G /workspace/data lab16
+        # 3530588    3.4G /workspace/data lab21
+        # 3530588    3.4G /workspace/data lab14
+        #       4    4.0K /workspace/data lab12
+        # 3530588    3.4G /workspace/data lab15
+        #      20     20K /workspace/data lab23
+# Security
+#
+#
+# //Security: ASCII code characters excl. tab and CRLF
+# //Matches any single non-printable code character that may cause trouble in certain situations.
+# //Excludes tabs and line breaks.
+# '[\x00\x08\x0B\x0C\x0E-\x1F]'
+#
+# //Security: ASCII code characters incl. tab and CRLF
+# //Matches any single non-printable code character that may cause trouble in certain situations.
+# //Includes tabs and line breaks.
+# '[\x00-\x1F]'
+#
+# //Security: Escape quotes and backslashes
+# //E.g. escape user input before inserting it into a SQL statement
+# gsub("\\$0", "\\$0", $text);
+#
+# //Security: Unicode code and unassigned characters excl. tab and CRLF
+# //Matches any single non-printable code character that may cause trouble in certain situations.
+# //Also matches any Unicode code point that is unused in the current Unicode standard,
+# //and thus should not occur in text as it cannot be displayed.
+# //Excludes tabs and line breaks.
+# '[^\P{C}\t\r\n]'
+#
+# //Security: Unicode code and unassigned characters incl. tab and CRLF
+# //Matches any single non-printable code character that may cause trouble in certain situations.
+# //Also matches any Unicode code point that is unused in the current Unicode standard,
+# //and thus should not occur in text as it cannot be displayed.
+# //Includes tabs and line breaks.
+# '\p{C}'
+#
+# //Security: Unicode code characters excl. tab and CRLF
+# //Matches any single non-printable code character that may cause trouble in certain situations.
+# //Excludes tabs and line breaks.
+# '[^\P{Cc}\t\r\n]'
+#
+# //Security: Unicode code characters incl. tab and CRLF
+# //Matches any single non-printable code character that may cause trouble in certain situations.
+# //Includes tabs and line breaks.
+# '\p{Cc}'
+#
+#
+#
+# SSN (Social security numbers)
+#
+#
+# //Social security number (US)
+# '\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b'

data/lib/wuclan/twitter/model/text_element.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module Wuclan::Models
+  #
+  #
+  #
+  module TextElementCommon
+    # Key on text-status_id
+    def num_key_fields()  2  end
+  end
+  #
+  # Topical #hashtags extracted from tweet text
+  #
+  # the twitter_user_id is denormalized
+  # but is often what we wnat: saves a join
+  #
+  class Hashtag < TypedStruct.new(
+      [:hashtag,         String      ],
+      [:status_id,       Integer     ],
+      [:twitter_user_id, Integer     ]
+      )
+    include ModelCommon
+    include TextElementCommon
+    alias_method :text, :hashtag
+    def numeric_id_fields()     [:twitter_user_id, :status_id] ; end
+  end
+  class TweetUrl < TypedStruct.new(
+      [:tweet_url,       String      ],
+      [:status_id,       Integer     ],
+      [:twitter_user_id, Integer     ]
+      )
+    include ModelCommon
+    include TextElementCommon
+    alias_method :text, :tweet_url
+    def numeric_id_fields()     [:twitter_user_id, :status_id] ; end
+  end
+end

data/lib/wuclan/twitter/model/tweet/tokenize.rb ADDED Viewed

@@ -0,0 +1,38 @@
+require 'wuclan/models/tweet/tweet_token'
+require 'wukong/encoding'
+module Wuclan::Models
+  Tweet.class_eval do
+    def string_for_tokenizing
+      # simpleminded test for non-latin script: don't bother if > 20 entities
+      return if (text.count('&') > 20)
+      # skip default message from early days
+      return if (text =~ /just setting up my twttr/);
+      # return decoded, whitespace-flattened text
+      self.decoded_text.gsub(/\s+/s, ' ').strip
+    end
+    def tokens_for klass, str
+      klass.extract_tokens!(str).map do |word|
+        klass.new(word, twitter_user_id, id, 1)
+      end
+    end
+    def tokenize extract_word_tokens=nil
+      str = string_for_tokenizing
+      return [] if str.blank?
+      toks = []
+      # Case-sensitive tokens
+      [ SmilieToken, UrlToken ].each do |klass|
+        toks += tokens_for klass, str
+      end
+      # Case-insensitive tokens
+      str.downcase!
+      [ RtToken, AtsignToken, HashtagToken ].each do |klass| # ,
+        toks += tokens_for klass, str
+      end
+      toks += tokens_for WordToken, str if extract_word_tokens
+      toks
+    end
+  end
+end

data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb ADDED Viewed

@@ -0,0 +1,202 @@
+#!/usr/bin/env ruby
+module Wuclan
+  module Models
+    module TweetRegexes
+      # ===========================================================================
+      #
+      # Twitter accepts URLs somewhat idiosyncratically, probably for good reason --
+      # we rarely see ()![] in urls; more likely in a status they are punctuation.
+      #
+      # This is what I've reverse engineered.
+      #
+      #
+      # Notes:
+      #
+      # * is.gd uses a trailing '-' (to indicate 'preview mode'): clever.
+      # * pastoid.com uses a trailing '+', and idek.net a trailing ~ for no reason. annoying.
+      #
+      # Counterexamples:
+      # * http://www.5irecipe.cn/recipe_content/2307/'/
+      # * http://www.facebook.com/groups.php?id=1347199977&gv=12#/group.php?gid=18183539495
+      #
+      RE_DOMAIN_HEAD       = '(?:[a-zA-Z0-9\-]+\.)+'
+      RE_DOMAIN_TLD        = '(?:com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum|[a-zA-Z]{2})'
+      # RE_URL_SCHEME      = '[a-zA-Z][a-zA-Z0-9\-\+\.]+'
+      RE_URL_SCHEME_STRICT = '[a-zA-Z]{3,6}'
+      RE_URL_UNRESERVED    = 'a-zA-Z0-9'   + '\-\._~'
+      RE_URL_OKCHARS       = RE_URL_UNRESERVED + '\'\+\,\;=' + '/%:@'   # not !$&()* [] \|
+      RE_URL_QUERYCHARS    = RE_URL_OKCHARS    + '&='
+      RE_URL_HOSTPART      = "#{RE_URL_SCHEME_STRICT}://#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}"
+      RE_URL               = %r{(
+                #{RE_URL_HOSTPART}                   # Host
+     (?:(?: \/ [#{RE_URL_OKCHARS}]+?          )*?    # path:  / delimited path segments
+        (?: \/ [#{RE_URL_OKCHARS}]*[\w\-\+\~] )      #        where the last one ends in a non-punctuation.
+       |                                             #        ... or no path segment
+                                              )\/?   #        with an optional trailing slash
+        (?: \? [#{RE_URL_QUERYCHARS}]+  )?           # query: introduced by a ?, with &foo= delimited segments
+        (?: \# [#{RE_URL_OKCHARS}]+     )?           # frag:  introduced by a #
+      )}x
+      #
+      # Technically a scheme can allow the characters '+', '-' and '.' within
+      # it. In practice you can not only ignore those characters but all but a
+      # few specific schemes.
+      #
+      # From a collection of ~9M tweeted urls, 99.4% were http://, with only the additional
+      #   https, mms, ftp, git, irc, feed, itpc, rtsp, hxxp, gopher, telnet, itms, ssh, webcal, svn
+      # seemingly worth finding:
+      #
+      #   8925742 http
+      #      6026 https  1841 ivo  122 mms    85 ftp    61 git  53 irc   45 feed   31 itpc  12 www
+      #        12 rtsp     12 hxxp  12 gopher  9 telnet  9 itms  7 ssh    5 webcal  5 sop    4 wiie
+      #         3 svn       3 sssp   3 file    2 res     1 xttp  1 xmlrpc 1 ssl     1 smb
+      #
+      # The hxxp scheme (http://en.wikipedia.org/wiki/Hxxp) is used to obscure a
+      # link, so make of that what you will.
+      #
+      # The ivo:// scheme is used by virtual astronomical observatories; as its
+      # hostnames are given in reverse-dotted notation (uk.org.estar) these URIs
+      # are imperfectly recognized.  Twitter doesn't accept them at all:
+      #   http://twitter.com/eSTAR_Project/status/1113930948
+      #
+      #
+      # ===========================================================================
+      #
+      # A hash following a non-alphanum_ (or at the start of the line),
+      # followed by (one or more of alpha, num, -_.+:=) and ending in an alphanum_
+      #
+      # This is overly generous to those dorky triple tags (geo:lat=69.3), but we'll soldier on somehow.
+      #
+      RE_HASHTAGS        = %r{(?:^|\W)\#([a-zA-Z0-9\-_\.+:=]+\w)(?:\W|$)}
+      # ===========================================================================
+      #
+      # Retweets and Retweet Whores
+      #
+      # See ARetweetsB for more info.
+      #
+      # A retweet
+      #   RT @interesting_user Something so witty Dorothy Parker would just give up
+      #   Oh yeah and so's your mom (via @sixth_grader)
+      #   retweeting @ogre: KEGGER TONITE RT pls
+      #     ^^^ this is not a rtwhore; it matches first as a retweet
+      #
+      # and rtwhores
+      #   retweet please: Hey here's something I'm whoring xxx
+      #   KEGGER TONITE RT pls
+      #
+      # or semantically-incorrect matches such as (actual example):
+      #    @somebody lol, love the 'please retweet' ending!
+      #
+      # Things that don't match:
+      #   retweet is silly, @i_think_youre_dumb
+      #    misspell the name of my Sony Via
+      #
+      RE_RETWEET_WORDS  = 'rt|retweet|retweeting'
+      RE_RETWEET_ONLY   = %r{(?:#{RE_RETWEET_WORDS})}
+      RE_RETWEET_OR_VIA = %r{(?:#{RE_RETWEET_WORDS}|via|from)}
+      RE_PLEASE         = %r{(?:please|plz|pls)}
+      RE_RETWEET        = %r{\b#{RE_RETWEET_OR_VIA}\W*@(\w+)\b}i
+      RE_RTWHORE        = %r{
+          \b#{RE_RETWEET_ONLY}\W*#{RE_PLEASE}\b
+        | \b#{RE_PLEASE}\W*#{RE_RETWEET_ONLY}\b}ix
+      # ===========================================================================
+      #
+      # An @ following either the start of the line or a non-alphanum_ character;
+      # the captured name is the run of [a-zA-Z0-9_] that follows the @.
+      #
+      # Note carefully: we _demand_ a preceding character (or start of line):
+      # \b would match email@address.com, which we don't want.
+      #
+      # Making an exception for RT@im_cramped_for_space.
+      #
+      # All retweets
+      #
+      RE_ATSIGNS         = %r{(?:^|\W|#{RE_RETWEET_OR_VIA})@(\w+)\b}
+      # ===========================================================================
+      #
+      # Numbers, IP addresses and credit cards (commented-out sketches)
+      #
+      # RE_NUMBERS = %r{
+      #   (?:^|\D)                       # non-number
+      #   (
+      #    |(?:\(\d{3}\)[\ \-]?\d{3}[\ \-]\d{4})
+      #    |(?: (?:\d{1,3}\.)(?:\d{3},)*\.?\d+)        # decimal number
+      #    |(?: (?:\d{1,3}\.)(?:\d{3}\.)*,?\d+)        # euro-style
+      #    \d+
+      #   )
+      # }x
+      #
+      # # IP address
+      # \b(?:\d{1,3}\.){3}\d{1,3}\b
+      # credit card: (lax)
+      # \b(?:\d[ -]*){13,16}\b
+      # \b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})\b
+      #
+      # [-+]?[0-9,]*\.?[0-9]*
+      # [-+]?[0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?
+      # ===========================================================================
+      #
+      # Smilies !!! ^_^
+      #
+      RE_SMILIES_EYES  = "\\:8;"
+      RE_SMILIES_NOSE  = "\\-=\\*o"
+      RE_SMILIES_MOUTH = "DP@Oo\\(\\)\\[\\]\\|\\{\\}\\/\\\\"
+      RE_SMILIES = %r{
+        (?:^|\W)                       # non-smilie character
+        ( (?:
+            >?
+            [#{RE_SMILIES_EYES}]       # eyes
+            [#{RE_SMILIES_NOSE}]?      # nose, maybe
+            [#{RE_SMILIES_MOUTH}] )    # mouth
+         |(?:
+            [#{RE_SMILIES_MOUTH}]      # mouth
+            [#{RE_SMILIES_NOSE}]?      # nose, maybe
+            [#{RE_SMILIES_EYES}]       # eyes
+            <? )
+         |(?: =[#{RE_SMILIES_MOUTH}])  # =) (=
+         |(?: [#{RE_SMILIES_MOUTH}]=)  # =) (=
+         |(?: \^[_\-]\^ )              # kawaaaaiiii!
+         |(?: :[,\']\( )               # snif
+         |(?: <3 )                     # heart
+         |(?: \\m/ )                   # rawk
+         |(?: x-\( )                   # dead
+        )
+        (?:\W|$)
+       }x
+    end
+  end
+end
+# http://mail.google.com/support/bin/answer.py?hl=en&answer=34056
+# http://en.wikipedia.org/wiki/Emoticons
+#
+# :-)  :)  =]  =)       Smiling, happy
+# :-(  =(  :[  :<       frowning, Sad
+# ;-)  ;)  ;]           Wink
+# :D   =D  XD  BD       Large grin or laugh
+# :P   =P  XP           Tongue out, or after a joke
+# <3   S2  :>           Love
+# :O   =O               Shocked or surprised
+# =I   :/  :-\          Bored, annoyed or awkward; concerned.
+# :S   =S  :?           Confused, embarrassed or uneasy
+# Icon          Meaning                 Icon            Meaning                         Icon    Meaning
+# (^_^)         smile                   (^o^)           laughing out loud               d(^_^)b thumbs up (not ears)
+# (T_T)         sad (crying face)       (-.-)Zzz        sleeping                        (Z.Z)   sleepy person
+# \(^_^)/       cheers, "Hurrah!"       (*^^*)          shyness                         (-_-);  sweating (as in ashamed), or exasperated.
+# (*3*)         "Surprise !."           (?_?)           "Nonsense, I don't know."       (^_~)   wink
+# (o.O)         shocked/disturbed       (<.<)           shifty, suspicious              v(^_^)v peace
+#
+# [\\dv](^_^)[bv/]
+#