cinch-url-scraper 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +14 -0
- data/README.md +3 -0
- data/lib/cinch/plugins/urlscraper.rb +105 -0
- metadata +80 -0
data/LICENSE
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Copyright (c) 2012 Michal Papis
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
14
|
+
|
data/lib/cinch/plugins/urlscraper.rb
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
# source: http://subforge.org/projects/shreds/repository/entry/bot/cinch.rb#L396
|
2
|
+
# @copyright (c) 2010-2012, Christoph Kappel <unexist@dorfelite.net>
|
3
|
+
|
4
|
+
require "cgi"
require "json"
require "time"
require "uri"

require "mechanize"
|
6
|
+
|
7
|
+
module Cinch
  module Plugins
    # Cinch plugin that watches channel messages for http/https links and
    # answers with the title of each linked page. A handful of hosts (IMDb,
    # YouTube, Gist, Pastie, Twitter) get a reply enriched with extra details
    # scraped from the page.
    class UrlScraper
      include Cinch::Plugin
      set :react_on, :channel

      listen_to :channel

      # strftime pattern used for every timestamp shown in a reply.
      TIME_FORMAT = "%Y-%m-%d %H:%M".freeze

      # Handles one channel message by describing every link found in it.
      #
      # Each link is processed on its own: when one of them cannot be parsed,
      # fetched or scraped, the error is handed to the bot's loggers and the
      # remaining links of the message are still handled.
      #
      # @param m [Cinch::Message] the incoming channel message
      # @return [void]
      def listen(m)
        URI.extract(m.message, ["http", "https"]) do |link|
          begin
            describe_link(m, link)
          rescue StandardError => e
            bot.loggers.exception(e)
          end
        end
      end

      private

      # Lazily built Mechanize agent, reused for every request of this plugin
      # instance. History is disabled (max_history = 0) so fetched pages are
      # not retained by the agent.
      #
      # @return [Mechanize]
      def agent
        @agent ||= Mechanize.new.tap do |mech|
          mech.user_agent_alias = "Linux Mozilla"
          mech.max_history = 0
        end
      end

      # Fetches +link+ and sends at most one reply describing it.
      #
      # @param m [Cinch::Message] message to reply to
      # @param link [String] the link as it appeared in the message
      # @return [void]
      # @raise [URI::InvalidURIError] when +link+ cannot be parsed
      def describe_link(m, link)
        uri = URI.parse(link)

        begin
          page = agent.get(link)
        rescue Mechanize::ResponseCodeError
          if "www.youtube.com" == uri.host
            m.reply "Thank you, GEMA!"
          else
            # NOTE(review): the second argument is presumably Cinch's
            # "prefix" flag (address the reply to the sender) - confirm.
            m.reply "Y U POST BROKEN LINKS?", true
          end
          return
        end

        # Images, archives and other non-HTML responses carry neither a
        # title nor a DOM that could be searched.
        return unless page.is_a?(Mechanize::Page)

        title = clean_title(page)
        text = case uri.host
               when "www.imdb.com"    then imdb_summary(page, uri, title)
               when "www.youtube.com" then youtube_summary(uri, title)
               when "gist.github.com" then gist_summary(page, uri, title)
               when "pastie.org"      then pastie_summary(page, uri, title)
               when "twitter.com"     then twitter_summary(link, uri, title)
               else                        generic_summary(uri, title)
               end

        m.reply text if text
      end

      # Extracts the page title with control characters removed and runs of
      # spaces collapsed.
      #
      # @param page [Mechanize::Page]
      # @return [String, nil] the cleaned title, or nil when the page has no
      #   usable title
      def clean_title(page)
        raw = page.title
        return nil if raw.nil?

        raw.gsub(/[\x00-\x1f]+/, "").gsub(/ {2,}/, " ").strip
      rescue ArgumentError
        # Raised by gsub for titles containing invalid byte sequences.
        nil
      end

      # Formats a scraped timestamp for display.
      #
      # @param text [String] timestamp text taken from a page
      # @return [String] the time rendered with TIME_FORMAT
      # @raise [ArgumentError] when +text+ holds no parseable time
      def format_time(text)
        Time.parse(text).strftime(TIME_FORMAT)
      end

      # Default reply: just the title and the host.
      #
      # @return [String, nil] nil when the page has no title
      def generic_summary(uri, title)
        "Title: %s (at %s)" % [ title, uri.host ] if title
      end

      # IMDb reply: title plus user rating and number of votes.
      #
      # @return [String]
      def imdb_summary(page, uri, title)
        rating = page.search("//strong/span[@itemprop='ratingValue']").text
        votes  = page.search("//a/span[@itemprop='ratingCount']").text

        "Title: %s (at %s, %s/10 from %s users)" % [
          title, uri.host, rating, votes
        ]
      end

      # YouTube reply: title plus view count and likes/dislikes. The page is
      # fetched again with the "nofeather" flag before scraping.
      #
      # @return [String]
      def youtube_summary(uri, title)
        # Merge the flag into the query instead of appending "&..." to the
        # raw link, which breaks for links without a query string or with a
        # fragment.
        reload = uri.dup
        params = [ uri.query, "nofeather=True" ]
        reload.query = params.reject { |part| part.nil? || part.empty? }.join("&")
        page = agent.get(reload.to_s)

        # Drop thousands separators from the counters.
        hits  = page.search("//span[@class='watch-view-count']/strong").text.delete(".,")
        likes = page.search("//span[@class='watch-likes-dislikes']").text.delete(".,")

        "Title: %s (at %s, %s hits, %s)" % [
          title, uri.host, hits, likes.strip
        ]
      end

      # Gist reply: title plus owner and creation time.
      #
      # @return [String]
      def gist_summary(page, uri, title)
        owner = page.search("//div[@class='name']/a").inner_html
        age   = format_time(page.search("//span[@class='date']/abbr").text)

        "Title: %s (at %s, %s on %s)" % [
          title, uri.host, owner, age
        ]
      end

      # Pastie reply: title plus creation time.
      #
      # @return [String]
      def pastie_summary(page, uri, title)
        age = format_time(page.search("//span[@class='typo_date']").text)

        "Title: %s (at %s, on %s)" % [
          title, uri.host, age
        ]
      end

      # Twitter reply: for links ending in /status/<id> the tweet is loaded
      # as JSON and quoted; any other twitter.com link is reported as broken.
      #
      # @return [String, nil] nil for a non-status link without a title
      def twitter_summary(link, uri, title)
        if link =~ %r{/status/(\d+)\z}
          id = Regexp.last_match(1)

          # NOTE(review): this is the unauthenticated v1 REST endpoint;
          # confirm it is still served and that "user" still carries
          # "screen_name" when trim_user=1 is set.
          json  = agent.get("https://api.twitter.com/1/statuses/show/#{id}.json?trim_user=1").body
          tweet = JSON.parse(json)

          "@%s: %s" % [ tweet["user"]["screen_name"], CGI.unescapeHTML(tweet["text"]) ]
        elsif title
          "Broken twitter link: %s (at %s)" % [ title, uri.host ]
        end
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cinch-url-scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Michal Papis
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-26 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: cinch
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '2'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: mechanize
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '2'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '2'
|
46
|
+
description: A Cinch plugin to get information about posted URLs.
|
47
|
+
email:
|
48
|
+
- mpapis@gmail.com
|
49
|
+
executables: []
|
50
|
+
extensions: []
|
51
|
+
extra_rdoc_files: []
|
52
|
+
files:
|
53
|
+
- LICENSE
|
54
|
+
- README.md
|
55
|
+
- lib/cinch/plugins/urlscraper.rb
|
56
|
+
homepage: https://github.com/mpapis/cinch-url-scraper
|
57
|
+
licenses: []
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options: []
|
60
|
+
require_paths:
|
61
|
+
- lib
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ! '>='
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: 1.9.1
|
68
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ! '>='
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: '0'
|
74
|
+
requirements: []
|
75
|
+
rubyforge_project:
|
76
|
+
rubygems_version: 1.8.24
|
77
|
+
signing_key:
|
78
|
+
specification_version: 3
|
79
|
+
summary: A Cinch plugin to get information about posted URLs.
|
80
|
+
test_files: []
|