cinch-url-scraper 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. data/LICENSE +14 -0
  2. data/README.md +3 -0
  3. data/lib/cinch/plugins/urlscraper.rb +105 -0
  4. metadata +80 -0
data/LICENSE ADDED
@@ -0,0 +1,14 @@
1
+ Copyright (c) 2012 Michal Papis
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+
@@ -0,0 +1,3 @@
1
+ # Cinch Url Scraper plugin
2
+
3
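+ A Cinch plugin that replies in the channel with the page title of every posted http(s) URL, plus extra details for IMDb, YouTube, Gist, Pastie and Twitter links.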
@@ -0,0 +1,105 @@
1
+ # source: http://subforge.org/projects/shreds/repository/entry/bot/cinch.rb#L396
2
+ # @copyright (c) 2010-2012, Christoph Kappel <unexist@dorfelite.net>
3
+
4
+ require "json"
5
+ require "mechanize"
6
+
7
# Standard-library pieces this plugin calls directly (CGI.unescapeHTML,
# Time.parse, URI.extract). Required explicitly so the plugin does not depend
# on another library having loaded them first.
require "cgi"
require "time"
require "uri"

module Cinch
  module Plugins
    # Cinch plugin that scans channel messages for http/https links, fetches
    # each one with Mechanize and replies with the page title. A handful of
    # hosts (IMDb, YouTube, Gist, Pastie, subforge/subtle, Twitter) get a
    # richer, host-specific reply.
    class UrlScraper
      include Cinch::Plugin
      set :react_on, :channel

      listen_to :channel

      # Handles one channel message: every http(s) link found in the message
      # text is fetched and described independently of the others.
      #
      # @param m the message event handed over by Cinch; only `m.message`
      #   and `m.reply` are used.
      def listen(m)
        URI.extract(m.message, ["http", "https"]) do |link|
          announce_link(m, link)
        end
      end

      private

      # Lazily built Mechanize agent, shared by all fetches of this plugin
      # instance. History is disabled (max_history = 0).
      def scraper_agent
        @agent ||= Mechanize.new.tap do |agent|
          agent.user_agent_alias = "Linux Mozilla"
          agent.max_history = 0
        end
      end

      # Fetches a single link and sends the matching reply.
      #
      # Links that URI.parse rejects are skipped instead of aborting the
      # whole message. Responses that are not HTML pages are skipped as well:
      # they have neither a title nor a searchable document.
      def announce_link(m, link)
        uri = parse_link(link)
        return if uri.nil?

        # Hosts are compared case-insensitively; replies still show the host
        # exactly as it was posted.
        host = uri.host.to_s.downcase

        begin
          page = scraper_agent.get(link)
        rescue Mechanize::ResponseCodeError
          if host == "www.youtube.com"
            m.reply "Thank you, GEMA!"
          else
            m.reply "Y U POST BROKEN LINKS?", true
          end
          return
        end

        return unless page.is_a?(Mechanize::Page)

        title = clean_title(page)

        case host
        when "www.imdb.com"
          describe_imdb(m, uri, page, title)
        when "www.youtube.com"
          describe_youtube(m, uri, page, title)
        when "gist.github.com"
          describe_gist(m, uri, page, title)
        when "pastie.org"
          describe_pastie(m, uri, page, title)
        when "subforge.org", "subtle.de"
          m.reply "Title: %s (at %s)" % [title, uri.host]
        when "twitter.com"
          describe_twitter(m, uri, link, title)
        else
          reply_title(m, uri, title)
        end
      end

      # Returns the parsed URI, or nil when the extracted text is not a
      # valid URI.
      def parse_link(link)
        URI.parse(link)
      rescue URI::InvalidURIError
        nil
      end

      # Page title with control characters removed and runs of spaces
      # collapsed; nil when the page has no title or the title cannot be
      # processed (e.g. invalid byte sequences).
      def clean_title(page)
        raw = page.title
        return nil if raw.nil?

        raw.gsub(/[\x00-\x1f]+/, "").gsub(/ {2,}/, " ").strip
      rescue ArgumentError, EncodingError
        nil
      end

      # Generic reply; stays silent when there is no usable title.
      def reply_title(m, uri, title)
        m.reply "Title: %s (at %s)" % [title, uri.host] if title
      end

      # Formats a scraped timestamp as "YYYY-MM-DD HH:MM"; nil when the text
      # holds no parsable time (for instance because the selector matched
      # nothing and the text is empty).
      def format_age(text)
        Time.parse(text).strftime("%Y-%m-%d %H:%M")
      rescue ArgumentError
        nil
      end

      # IMDb: title plus user rating and vote count.
      def describe_imdb(m, uri, page, title)
        rating = page.search("//strong/span[@itemprop='ratingValue']").text
        votes = page.search("//a/span[@itemprop='ratingCount']").text

        m.reply "Title: %s (at %s, %s/10 from %s users)" % [
          title, uri.host, rating, votes
        ]
      end

      # YouTube: title plus view count and like/dislike text, read from a
      # reload of the page with the nofeather parameter. If that reload
      # fails, the page that was already fetched is used instead.
      def describe_youtube(m, uri, page, title)
        detail = reload_without_feather(uri) || page

        hits = detail.search("//span[@class='watch-view-count']/strong")
        hits = hits.text.gsub(/[.,]/, "")

        likes = detail.search("//span[@class='watch-likes-dislikes']")
        likes = likes.text.gsub(/[.,]/, "")

        m.reply "Title: %s (at %s, %s hits, %s)" % [
          title, uri.host, hits, likes.strip
        ]
      end

      # Re-fetches a YouTube URL with nofeather=True added to its query.
      # Going through the URI object keeps the URL valid when the link has no
      # query string yet or carries a fragment. Returns nil when the request
      # fails or does not yield an HTML page.
      def reload_without_feather(uri)
        target = uri.dup
        params = [uri.query, "nofeather=True"].reject { |part| part.nil? || part.empty? }
        target.query = params.join("&")

        reloaded = scraper_agent.get(target.to_s)
        reloaded.is_a?(Mechanize::Page) ? reloaded : nil
      rescue Mechanize::ResponseCodeError
        nil
      end

      # Gist: title plus owner and creation time. Falls back to the generic
      # reply when no timestamp can be read from the page.
      def describe_gist(m, uri, page, title)
        owner = page.search("//div[@class='name']/a").inner_html
        age = format_age(page.search("//span[@class='date']/abbr").text)
        return reply_title(m, uri, title) if age.nil?

        m.reply "Title: %s (at %s, %s on %s)" % [
          title, uri.host, owner, age
        ]
      end

      # Pastie: title plus paste time. Falls back to the generic reply when
      # no timestamp can be read from the page.
      def describe_pastie(m, uri, page, title)
        age = format_age(page.search("//span[@class='typo_date']").text)
        return reply_title(m, uri, title) if age.nil?

        m.reply "Title: %s (at %s, on %s)" % [
          title, uri.host, age
        ]
      end

      # Twitter: status links are answered with the tweet text taken from the
      # JSON API; any other twitter.com link is reported as broken. The id is
      # taken from the raw link (not uri.path) so links that carry the status
      # path in the fragment keep matching; a trailing slash or query string
      # after the id is tolerated.
      def describe_twitter(m, uri, link, title)
        status_id = link[%r{/status/(\d+)/?(?:\?[^#]*)?\z}, 1]

        if status_id.nil?
          m.reply "Broken twitter link: %s (at %s)" % [title, uri.host] if title
          return
        end

        tweet = fetch_tweet(status_id)
        # API unreachable or answer unusable: degrade to the generic reply.
        return reply_title(m, uri, title) if tweet.nil?

        author = tweet["user"].is_a?(Hash) ? tweet["user"]["screen_name"] : nil
        unescaped = CGI.unescapeHTML(tweet["text"].to_s)

        m.reply "@%s: %s" % [author, unescaped]
      end

      # Loads one status from the Twitter JSON API. Returns the decoded Hash,
      # or nil when the request fails or the body is not a JSON object.
      # NOTE(review): this v1 endpoint is presumably retired, and with
      # trim_user=1 the user object may not include screen_name - confirm
      # against the current API before relying on this branch.
      def fetch_tweet(status_id)
        json = scraper_agent.get("https://api.twitter.com/1/statuses/show/#{status_id}.json?trim_user=1").body
        tweet = JSON.parse(json)
        tweet.is_a?(Hash) ? tweet : nil
      rescue Mechanize::ResponseCodeError, JSON::ParserError
        nil
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cinch-url-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Michal Papis
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-26 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: cinch
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '2'
30
+ - !ruby/object:Gem::Dependency
31
+ name: mechanize
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: '2'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '2'
46
+ description: A Cinch plugin to get information about posted URLs.
47
+ email:
48
+ - mpapis@gmail.com
49
+ executables: []
50
+ extensions: []
51
+ extra_rdoc_files: []
52
+ files:
53
+ - LICENSE
54
+ - README.md
55
+ - lib/cinch/plugins/urlscraper.rb
56
+ homepage: https://github.com/mpapis/cinch-url-scraper
57
+ licenses: []
58
+ post_install_message:
59
+ rdoc_options: []
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ! '>='
66
+ - !ruby/object:Gem::Version
67
+ version: 1.9.1
68
+ required_rubygems_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ! '>='
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ requirements: []
75
+ rubyforge_project:
76
+ rubygems_version: 1.8.24
77
+ signing_key:
78
+ specification_version: 3
79
+ summary: A Cinch plugin to get information about posted URLs.
80
+ test_files: []