twitterscraper-ruby 0.15.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.circleci/config.yml +31 -0
- data/.gitignore +10 -0
- data/.irbrc +7 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +6 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +42 -0
- data/LICENSE.txt +21 -0
- data/README.md +174 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bin/twitterscraper +13 -0
- data/lib/twitterscraper-ruby.rb +1 -0
- data/lib/twitterscraper.rb +27 -0
- data/lib/twitterscraper/cache.rb +69 -0
- data/lib/twitterscraper/cli.rb +119 -0
- data/lib/twitterscraper/client.rb +18 -0
- data/lib/twitterscraper/http.rb +31 -0
- data/lib/twitterscraper/lang.rb +40 -0
- data/lib/twitterscraper/logger.rb +9 -0
- data/lib/twitterscraper/proxy.rb +65 -0
- data/lib/twitterscraper/query.rb +254 -0
- data/lib/twitterscraper/template.rb +48 -0
- data/lib/twitterscraper/tweet.rb +123 -0
- data/lib/version.rb +3 -0
- data/twitterscraper-ruby.gemspec +31 -0
- metadata +104 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'cgi'

module Twitterscraper
  # Builds a standalone HTML page that embeds a list of tweets using
  # Twitter's widgets.js loader.
  module Template
    module_function

    # Markup for a single embedded tweet; `__TWEET_URL__` is the placeholder
    # that is replaced with the tweet's permalink.
    EMBED_TWEET_HTML = <<~'HTML'.freeze
      <blockquote class="twitter-tweet">
      <a href="__TWEET_URL__"></a>
      </blockquote>
    HTML

    # Page skeleton; `__TWEETS__` is the placeholder that is replaced with
    # the concatenated per-tweet markup.
    EMBED_TWEETS_HTML = <<~'HTML'.freeze
      <html>
      <head>
      <style type=text/css>
      .twitter-tweet {
      margin: 30px auto 0 auto !important;
      }
      </style>
      <script>
      window.twttr = (function(d, s, id) {
      var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
      if (d.getElementById(id)) return t;
      js = d.createElement(s);
      js.id = id;
      js.src = "https://platform.twitter.com/widgets.js";
      fjs.parentNode.insertBefore(js, fjs);

      t._e = [];
      t.ready = function(f) {
      t._e.push(f);
      };

      return t;
      }(document, "script", "twitter-wjs"));
      </script>
      </head>
      <body>
      __TWEETS__
      </body>
      </html>
    HTML

    # Renders an HTML document embedding every given tweet.
    #
    # @param tweets [Enumerable<#tweet_url>] objects responding to `tweet_url`
    # @return [String] the complete HTML page
    def tweets_embedded_html(tweets)
      tweets_html = tweets.map do |tweet|
        # The URL ends up inside an href attribute, so escape HTML
        # metacharacters. The block form of `sub` inserts the value verbatim;
        # the String form would expand backslash sequences such as "\0".
        escaped_url = CGI.escapeHTML(tweet.tweet_url.to_s)
        EMBED_TWEET_HTML.sub('__TWEET_URL__') { escaped_url }
      end

      body = tweets_html.join
      EMBED_TWEETS_HTML.sub('__TWEETS__') { body }
    end
  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'time'
|
2
|
+
|
3
|
+
module Twitterscraper
  # Plain value object holding the attributes scraped for a single tweet.
  # Instances are built either from previously serialized JSON (.from_json)
  # or from the HTML of a tweet stream (.from_html).
  class Tweet
    # Attribute names exposed as readers and serialized by #attrs / #to_json.
    KEYS = %i[
      screen_name
      name
      user_id
      tweet_id
      text
      links
      hashtags
      image_urls
      video_url
      has_media
      likes
      retweets
      replies
      is_replied
      is_reply_to
      parent_tweet_id
      reply_to_users
      tweet_url
      timestamp
      created_at
    ].freeze
    attr_reader(*KEYS)

    # @param attrs [Hash] attribute name (Symbol or String) => value; every
    #   pair is stored in the instance variable of the same name
    def initialize(attrs)
      attrs.each do |key, value|
        instance_variable_set("@#{key}", value)
      end
    end

    # @return [Hash{Symbol => Object}] all KEYS with their current values
    def attrs
      KEYS.map { |key| [key, send(key)] }.to_h
    end

    # Serializes the tweet as a JSON object.
    #
    # @param options [Hash, JSON::State] generator options/state; forwarded so
    #   that callers such as JSON.pretty_generate format nested tweets too
    # @return [String]
    def to_json(options = {})
      attrs.to_json(options)
    end

    class << self
      # Rebuilds tweets from a JSON array of tweet objects.
      #
      # @param text [String] JSON array produced by serializing tweets
      # @return [Array<Tweet>]
      def from_json(text)
        JSON.parse(text).map do |tweet|
          created_at = tweet['created_at']
          # created_at is serialized as a string; restore it to a Time when present.
          tweet['created_at'] = Time.parse(created_at) if created_at
          new(tweet)
        end
      end

      # Parses a whole HTML document and extracts every tweet in the stream.
      #
      # @param text [String] HTML markup
      # @return [Array<Tweet>]
      def from_html(text)
        html = Nokogiri::HTML(text)
        from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
      end

      # @param html [Enumerable] tweet container nodes
      # @return [Array<Tweet>] parsed tweets; nodes that could not be parsed are dropped
      def from_tweets_html(html)
        html.map { |tweet| from_tweet_html(tweet) }.compact
      end

      # Builds a Tweet from a single tweet container node.
      #
      # @param html [#attr, #inner_html] the tweet container node
      # @return [Tweet, nil] nil (with a warning logged) when the node has no
      #   tweet text container
      def from_tweet_html(html)
        screen_name = html.attr('data-screen-name')
        tweet_id = html.attr('data-tweet-id')&.to_i

        unless html.to_s.include?('js-tweet-text-container')
          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
          return nil
        end

        inner_html = Nokogiri::HTML(html.inner_html)
        text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
        links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
        image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
        video_url = inner_html.xpath("//div[@class[contains(., 'PlayableMedia-container')]]/a").map { |elem| elem.attr('href') }[0]
        # Always a strict boolean (nil.to_s is ""), never nil.
        has_media = !image_urls.empty? || !video_url.to_s.empty?

        actions = inner_html.xpath("//div[@class[contains(., 'ProfileTweet-actionCountList')]]")
        likes = action_count(actions, 'ProfileTweet-action--favorite')
        retweets = action_count(actions, 'ProfileTweet-action--retweet')
        replies = action_count(actions, 'ProfileTweet-action--reply u-hiddenVisually')
        is_replied = replies != 0

        parent_tweet_id = inner_html.xpath('//*[@data-conversation-id]').first.attr('data-conversation-id').to_i
        if tweet_id == parent_tweet_id
          # The conversation id equals the tweet's own id: not a reply.
          is_reply_to = false
          parent_tweet_id = nil
          reply_to_users = []
        else
          is_reply_to = true
          reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
        end

        timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
        new(
          screen_name: screen_name,
          name: html.attr('data-name'),
          user_id: html.attr('data-user-id').to_i,
          tweet_id: tweet_id,
          text: text,
          links: links,
          hashtags: text.scan(/#\w+/).map { |tag| tag.delete_prefix('#') },
          image_urls: image_urls,
          video_url: video_url,
          has_media: has_media,
          likes: likes,
          retweets: retweets,
          replies: replies,
          is_replied: is_replied,
          is_reply_to: is_reply_to,
          parent_tweet_id: parent_tweet_id,
          reply_to_users: reply_to_users,
          tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
          timestamp: timestamp,
          created_at: Time.at(timestamp, in: '+00:00'),
        )
      end

      private

      # Reads one engagement counter (likes / retweets / replies).
      #
      # @param actions [#xpath] node set to query
      # @param action_class [String] class fragment identifying the counter's span
      # @return [Integer] the counter value, or 0 when the counter node or its
      #   data-tweet-stat-count attribute is missing (nil.to_i is 0)
      def action_count(actions, action_class)
        counter = actions.xpath("//span[@class[contains(., '#{action_class}')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first
        counter&.attr('data-tweet-stat-count').to_i
      end
    end
  end
end
|
data/twitterscraper-ruby.gemspec
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require_relative 'lib/version'
|
2
|
+
|
3
|
+
# Gem specification for twitterscraper-ruby: identity, packaged files,
# executable and runtime dependencies.
Gem::Specification.new do |spec|
  spec.name          = "twitterscraper-ruby"
  spec.version       = Twitterscraper::VERSION
  spec.authors       = ["ts-3156"]
  spec.email         = ["ts_3156@yahoo.co.jp"]

  spec.summary       = %q{A gem to scrape Tweets}
  spec.description   = %q{A gem to scrape Tweets}
  spec.homepage      = "https://github.com/ts-3156/twitterscraper-ruby"
  spec.license       = "MIT"

  # Declared exactly once. This spec previously also assigned ">= 2.3.0"
  # earlier on, which the later assignment silently overrode.
  spec.required_ruby_version = ">= 2.6.4"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = spec.homepage

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject { |f| f.match?(%r{^(test|spec|features)/}) }
  end
  spec.executables   = ["twitterscraper"]
  spec.require_paths = ["lib"]

  spec.add_dependency "nokogiri"
  spec.add_dependency "parallel"
end
|
metadata
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twitterscraper-ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.15.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- ts-3156
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-07-17 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: parallel
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: A gem to scrape Tweets
|
42
|
+
email:
|
43
|
+
- ts_3156@yahoo.co.jp
|
44
|
+
executables:
|
45
|
+
- twitterscraper
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- ".circleci/config.yml"
|
50
|
+
- ".gitignore"
|
51
|
+
- ".irbrc"
|
52
|
+
- ".rspec"
|
53
|
+
- ".ruby-version"
|
54
|
+
- ".travis.yml"
|
55
|
+
- CODE_OF_CONDUCT.md
|
56
|
+
- Gemfile
|
57
|
+
- Gemfile.lock
|
58
|
+
- LICENSE.txt
|
59
|
+
- README.md
|
60
|
+
- Rakefile
|
61
|
+
- bin/console
|
62
|
+
- bin/setup
|
63
|
+
- bin/twitterscraper
|
64
|
+
- lib/twitterscraper-ruby.rb
|
65
|
+
- lib/twitterscraper.rb
|
66
|
+
- lib/twitterscraper/cache.rb
|
67
|
+
- lib/twitterscraper/cli.rb
|
68
|
+
- lib/twitterscraper/client.rb
|
69
|
+
- lib/twitterscraper/http.rb
|
70
|
+
- lib/twitterscraper/lang.rb
|
71
|
+
- lib/twitterscraper/logger.rb
|
72
|
+
- lib/twitterscraper/proxy.rb
|
73
|
+
- lib/twitterscraper/query.rb
|
74
|
+
- lib/twitterscraper/template.rb
|
75
|
+
- lib/twitterscraper/tweet.rb
|
76
|
+
- lib/version.rb
|
77
|
+
- twitterscraper-ruby.gemspec
|
78
|
+
homepage: https://github.com/ts-3156/twitterscraper-ruby
|
79
|
+
licenses:
|
80
|
+
- MIT
|
81
|
+
metadata:
|
82
|
+
homepage_uri: https://github.com/ts-3156/twitterscraper-ruby
|
83
|
+
source_code_uri: https://github.com/ts-3156/twitterscraper-ruby
|
84
|
+
changelog_uri: https://github.com/ts-3156/twitterscraper-ruby
|
85
|
+
post_install_message:
|
86
|
+
rdoc_options: []
|
87
|
+
require_paths:
|
88
|
+
- lib
|
89
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 2.6.4
|
94
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
requirements: []
|
100
|
+
rubygems_version: 3.0.3
|
101
|
+
signing_key:
|
102
|
+
specification_version: 4
|
103
|
+
summary: A gem to scrape Tweets
|
104
|
+
test_files: []
|