twitterscraper-ruby 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,48 @@
1
require 'erb'

module Twitterscraper
  # Renders scraped tweets as a standalone HTML page made of Twitter
  # "embedded tweet" blockquotes. The page loads Twitter's widgets.js from
  # platform.twitter.com, as written in EMBED_TWEETS_HTML below.
  module Template
    module_function

    # Builds one HTML document embedding every given tweet.
    #
    # @param tweets [Enumerable<#tweet_url>] objects responding to +tweet_url+
    # @return [String] the full HTML page; with no tweets the body is empty
    def tweets_embedded_html(tweets)
      tweets_html = tweets.map do |t|
        # The block form of String#sub inserts the replacement verbatim. The
        # string-argument form would interpret backslash sequences such as
        # "\0" or "\1" inside the URL as back-references.
        # The URL lands inside an href attribute, so it is HTML-escaped.
        EMBED_TWEET_HTML.sub('__TWEET_URL__') { ERB::Util.html_escape(t.tweet_url) }
      end
      EMBED_TWEETS_HTML.sub('__TWEETS__') { tweets_html.join }
    end

    # Markup for a single embedded tweet; __TWEET_URL__ is the placeholder
    # replaced by tweets_embedded_html.
    EMBED_TWEET_HTML = <<~'HTML'.freeze
      <blockquote class="twitter-tweet">
      <a href="__TWEET_URL__"></a>
      </blockquote>
    HTML

    # Page skeleton; __TWEETS__ is the placeholder replaced by the joined
    # per-tweet markup.
    EMBED_TWEETS_HTML = <<~'HTML'.freeze
      <html>
      <head>
      <style type=text/css>
      .twitter-tweet {
      margin: 30px auto 0 auto !important;
      }
      </style>
      <script>
      window.twttr = (function(d, s, id) {
      var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
      if (d.getElementById(id)) return t;
      js = d.createElement(s);
      js.id = id;
      js.src = "https://platform.twitter.com/widgets.js";
      fjs.parentNode.insertBefore(js, fjs);

      t._e = [];
      t.ready = function(f) {
      t._e.push(f);
      };

      return t;
      }(document, "script", "twitter-wjs"));
      </script>
      </head>
      <body>
      __TWEETS__
      </body>
      </html>
    HTML
  end
end
@@ -0,0 +1,123 @@
1
+ require 'time'
2
+
3
module Twitterscraper
  # A single scraped tweet. Instances are built from a hash of attributes,
  # from previously exported JSON (from_json), or from the HTML of a Twitter
  # stream item (from_html / from_tweet_html).
  class Tweet
    # Attribute names exposed as readers and serialized by #attrs / #to_json.
    KEYS = [
      :screen_name,
      :name,
      :user_id,
      :tweet_id,
      :text,
      :links,
      :hashtags,
      :image_urls,
      :video_url,
      :has_media,
      :likes,
      :retweets,
      :replies,
      :is_replied,
      :is_reply_to,
      :parent_tweet_id,
      :reply_to_users,
      :tweet_url,
      :timestamp,
      :created_at,
    ].freeze
    attr_reader(*KEYS)

    # @param attrs [Hash] attribute name (String or Symbol) => value; every
    #   pair is stored in the instance variable of the same name
    def initialize(attrs)
      attrs.each do |key, value|
        instance_variable_set("@#{key}", value)
      end
    end

    # @return [Hash{Symbol => Object}] every KEYS entry mapped to its current
    #   value (nil for attributes that were never set)
    def attrs
      KEYS.to_h { |key| [key, send(key)] }
    end

    # Serializes the tweet as a JSON object.
    #
    # The arguments are forwarded to Hash#to_json so that the generator state
    # (e.g. from JSON.pretty_generate) is honored when a tweet is nested
    # inside another structure.
    #
    # @return [String]
    def to_json(*args)
      attrs.to_json(*args)
    end

    class << self
      # Rebuilds tweets from a JSON array of tweet objects.
      #
      # @param text [String] JSON array of tweet objects
      # @return [Array<Tweet>] tweets with +created_at+ parsed into a Time
      def from_json(text)
        JSON.parse(text).map do |tweet|
          tweet['created_at'] = Time.parse(tweet['created_at'])
          new(tweet)
        end
      end

      # Parses an HTML page and extracts every stream-item tweet in it.
      #
      # @param text [String] HTML source
      # @return [Array<Tweet>]
      def from_html(text)
        html = Nokogiri::HTML(text)
        from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
      end

      # @param html [Enumerable] tweet nodes
      # @return [Array<Tweet>] parsed tweets; nodes that from_tweet_html
      #   rejects (returns nil for) are dropped
      def from_tweets_html(html)
        html.map { |tweet| from_tweet_html(tweet) }.compact
      end

      # Builds a Tweet from one tweet node.
      #
      # @param html [#attr, #inner_html, #to_s] a tweet node
      # @return [Tweet, nil] nil (after logging a warning) when the node has
      #   no 'js-tweet-text-container'
      def from_tweet_html(html)
        screen_name = html.attr('data-screen-name')
        tweet_id = html.attr('data-tweet-id')&.to_i

        unless html.to_s.include?('js-tweet-text-container')
          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
          return nil
        end

        inner_html = Nokogiri::HTML(html.inner_html)
        text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
        # Keep expanded URLs only, skipping links without one and pic.twitter media links.
        links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]")
                          .map { |elem| elem.attr('data-expanded-url') }
                          .select { |link| link && !link.include?('pic.twitter') }
        image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
        video_url = inner_html.xpath("//div[@class[contains(., 'PlayableMedia-container')]]/a").map { |elem| elem.attr('href') }.first
        # Always a real boolean: a missing video must yield false, not nil.
        has_media = !image_urls.empty? || !video_url.to_s.empty?

        actions = inner_html.xpath("//div[@class[contains(., 'ProfileTweet-actionCountList')]]")
        likes = action_count(actions, 'ProfileTweet-action--favorite')
        retweets = action_count(actions, 'ProfileTweet-action--retweet')
        replies = action_count(actions, 'ProfileTweet-action--reply u-hiddenVisually')
        is_replied = replies != 0

        # A tweet whose conversation id equals its own id is not a reply.
        parent_tweet_id = inner_html.xpath('//*[@data-conversation-id]').first.attr('data-conversation-id').to_i
        if tweet_id == parent_tweet_id
          is_reply_to = false
          parent_tweet_id = nil
          reply_to_users = []
        else
          is_reply_to = true
          reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
        end

        timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
        new(
          screen_name: screen_name,
          name: html.attr('data-name'),
          user_id: html.attr('data-user-id').to_i,
          tweet_id: tweet_id,
          text: text,
          links: links,
          hashtags: text.scan(/#\w+/).map { |tag| tag.delete_prefix('#') },
          image_urls: image_urls,
          video_url: video_url,
          has_media: has_media,
          likes: likes,
          retweets: retweets,
          replies: replies,
          is_replied: is_replied,
          is_reply_to: is_reply_to,
          parent_tweet_id: parent_tweet_id,
          reply_to_users: reply_to_users,
          tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
          timestamp: timestamp,
          created_at: Time.at(timestamp, in: '+00:00'),
        )
      end

      private

      # Reads the 'data-tweet-stat-count' of the counter span for one action.
      #
      # @param actions [#xpath] node set to query
      # @param action_class [String] class fragment identifying the action span
      # @return [Integer] the count, or 0 when the counter span is absent
      def action_count(actions, action_class)
        counter = actions.xpath("//span[@class[contains(., '#{action_class}')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first
        counter ? counter.attr('data-tweet-stat-count').to_i : 0
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Twitterscraper
  # Gem version string. Frozen so the shared constant cannot be mutated in place.
  VERSION = '0.15.0'.freeze
end
@@ -0,0 +1,31 @@
1
+ require_relative 'lib/version'
2
+
3
# Gem specification for twitterscraper-ruby.
Gem::Specification.new do |spec|
  spec.name = "twitterscraper-ruby"
  spec.version = Twitterscraper::VERSION
  spec.authors = ["ts-3156"]
  spec.email = ["ts_3156@yahoo.co.jp"]

  spec.summary = %q{A gem to scrape Tweets}
  spec.description = %q{A gem to scrape Tweets}
  spec.homepage = "https://github.com/ts-3156/twitterscraper-ruby"
  spec.license = "MIT"
  # Declared once. An earlier ">= 2.3.0" assignment was overridden by this
  # value further down the file, so ">= 2.6.4" was always the effective one.
  spec.required_ruby_version = ">= 2.6.4"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = spec.homepage

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match?(%r{^(test|spec|features)/}) }
  end
  spec.executables = ["twitterscraper"]
  spec.require_paths = ["lib"]

  spec.add_dependency "nokogiri"
  spec.add_dependency "parallel"
end
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twitterscraper-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.15.0
5
+ platform: ruby
6
+ authors:
7
+ - ts-3156
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-07-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: parallel
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: A gem to scrape Tweets
42
+ email:
43
+ - ts_3156@yahoo.co.jp
44
+ executables:
45
+ - twitterscraper
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".circleci/config.yml"
50
+ - ".gitignore"
51
+ - ".irbrc"
52
+ - ".rspec"
53
+ - ".ruby-version"
54
+ - ".travis.yml"
55
+ - CODE_OF_CONDUCT.md
56
+ - Gemfile
57
+ - Gemfile.lock
58
+ - LICENSE.txt
59
+ - README.md
60
+ - Rakefile
61
+ - bin/console
62
+ - bin/setup
63
+ - bin/twitterscraper
64
+ - lib/twitterscraper-ruby.rb
65
+ - lib/twitterscraper.rb
66
+ - lib/twitterscraper/cache.rb
67
+ - lib/twitterscraper/cli.rb
68
+ - lib/twitterscraper/client.rb
69
+ - lib/twitterscraper/http.rb
70
+ - lib/twitterscraper/lang.rb
71
+ - lib/twitterscraper/logger.rb
72
+ - lib/twitterscraper/proxy.rb
73
+ - lib/twitterscraper/query.rb
74
+ - lib/twitterscraper/template.rb
75
+ - lib/twitterscraper/tweet.rb
76
+ - lib/version.rb
77
+ - twitterscraper-ruby.gemspec
78
+ homepage: https://github.com/ts-3156/twitterscraper-ruby
79
+ licenses:
80
+ - MIT
81
+ metadata:
82
+ homepage_uri: https://github.com/ts-3156/twitterscraper-ruby
83
+ source_code_uri: https://github.com/ts-3156/twitterscraper-ruby
84
+ changelog_uri: https://github.com/ts-3156/twitterscraper-ruby
85
+ post_install_message:
86
+ rdoc_options: []
87
+ require_paths:
88
+ - lib
89
+ required_ruby_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: 2.6.4
94
+ required_rubygems_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ requirements: []
100
+ rubygems_version: 3.0.3
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: A gem to scrape Tweets
104
+ test_files: []