panchira 1.4.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 87af8a25ccd6d841b133aaa28ce7853c0111ae0ce4768f287a1f9a8a09eec148
4
- data.tar.gz: 93233eca9a9fd019fb82aafbd1e97630aa242d1645d2e2306fee52d1812aa144
3
+ metadata.gz: 9119f6e4ad4e4a3b551642f7d19a7853805d2c82b65c5fc868b472f265e7169c
4
+ data.tar.gz: 03cbbf38e009cd326b4f40f467de1dcc1d0fb852de362b3dde05ed56eb65c2c2
5
5
  SHA512:
6
- metadata.gz: 42e8539356b8c73b8cced17cdaec2452960f17175736a004702c37e7370f407ea995504a07e2b060c53f52629f42cb247e34080d9e54c52a07158fddb5e427c1
7
- data.tar.gz: 4b1746f991ba2353304c96e3297efca0a45ba0e3d9c908f653e72aa6920de3654f2714235d24037de591699287bf0c027b0720d6e2741cf3ef6a6e5ae1add95a
6
+ metadata.gz: 6ff2fab3b4489ade9e7accb6f10dc2d391aa5ba5ebbbb08f130926df81acd31eeae36d2208b915848e5e1337e71ab2a7cbe300eeeb728ed8669593fc139573f8
7
+ data.tar.gz: df76a29e1af2515d4eee99568426295e02fbcf8b1b6b835932633458f3f287d4ca2460ec1a497a73f2f8a5e66a100acf71aa37e35be3dd778c95ac83f2f808e7
@@ -31,3 +31,5 @@ jobs:
31
31
  run: bundle install
32
32
  - name: Run tests
33
33
  run: bundle exec rake test
34
+ env:
35
+ TWITTER_BEARER_TOKEN: ${{ secrets.TWITTER_BEARER_TOKEN }}
data/.rubocop.yml CHANGED
@@ -17,6 +17,10 @@ Layout/FirstHashElementIndentation:
17
17
  Layout/IndentationConsistency:
18
18
  EnforcedStyle: indented_internal_methods
19
19
 
20
+ Layout/MultilineAssignmentLayout:
21
+ EnforcedStyle: same_line
22
+ SupportedTypes: ["block"]
23
+
20
24
  Layout/MultilineMethodCallIndentation:
21
25
  EnforcedStyle: indented
22
26
 
@@ -30,6 +34,9 @@ Lint/MissingSuper:
30
34
  Exclude:
31
35
  - lib/panchira/resolvers/*
32
36
 
37
+ Lint/SymbolConversion:
38
+ EnforcedStyle: consistent
39
+
33
40
  Style/AsciiComments:
34
41
  Enabled: false
35
42
 
data/CHANGELOG.md CHANGED
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
4
4
  The format is based on [Keep a Changelog](http://keepachangelog.com/)
5
5
  and this project adheres to [Semantic Versioning](http://semver.org/).
6
6
 
7
+ ## 1.5.0 - 2022-03-01
8
+ ### Changed
9
+ - You can now pass options to `Panchira::fetch` and to Resolver constructors.
10
+ - Twitter resolvers can now fetch data from the Twitter API (requires a bearer token).
11
+ - Max execution time is now set to 10 seconds.
12
+
7
13
  ## 1.4.0 - 2022-01-10
8
14
  ### Added
9
15
  - Added support for non-Japanese pixiv URLs.
data/Gemfile.lock CHANGED
@@ -1,41 +1,43 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- panchira (1.4.0)
4
+ panchira (1.5.0)
5
5
  fastimage (~> 2.1.7)
6
- nokogiri (>= 1.10.9, < 1.13.0)
6
+ nokogiri (>= 1.10.9, < 1.14.0)
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
11
  ast (2.4.2)
12
12
  fastimage (2.1.7)
13
- minitest (5.14.4)
14
- nokogiri (1.12.5-x86_64-darwin)
13
+ mini_portile2 (2.8.0)
14
+ minitest (5.15.0)
15
+ nokogiri (1.13.3)
16
+ mini_portile2 (~> 2.8.0)
15
17
  racc (~> 1.4)
16
- parallel (1.20.1)
17
- parser (3.0.1.1)
18
+ parallel (1.21.0)
19
+ parser (3.1.1.0)
18
20
  ast (~> 2.4.1)
19
21
  racc (1.6.0)
20
- rainbow (3.0.0)
22
+ rainbow (3.1.1)
21
23
  rake (12.3.3)
22
- regexp_parser (2.1.1)
24
+ regexp_parser (2.2.1)
23
25
  rexml (3.2.5)
24
- rubocop (1.15.0)
26
+ rubocop (1.25.1)
25
27
  parallel (~> 1.10)
26
- parser (>= 3.0.0.0)
28
+ parser (>= 3.1.0.0)
27
29
  rainbow (>= 2.2.2, < 4.0)
28
30
  regexp_parser (>= 1.8, < 3.0)
29
31
  rexml
30
- rubocop-ast (>= 1.5.0, < 2.0)
32
+ rubocop-ast (>= 1.15.1, < 2.0)
31
33
  ruby-progressbar (~> 1.7)
32
34
  unicode-display_width (>= 1.4.0, < 3.0)
33
- rubocop-ast (1.5.0)
34
- parser (>= 3.0.1.1)
35
- rubocop-minitest (0.12.1)
35
+ rubocop-ast (1.16.0)
36
+ parser (>= 3.1.1.0)
37
+ rubocop-minitest (0.17.2)
36
38
  rubocop (>= 0.90, < 2.0)
37
39
  ruby-progressbar (1.11.0)
38
- unicode-display_width (2.0.0)
40
+ unicode-display_width (2.1.0)
39
41
 
40
42
  PLATFORMS
41
43
  ruby
data/README.md CHANGED
@@ -46,6 +46,16 @@ In most situation you would call `Panchira#fetch`. It is a singular method that
46
46
 
47
47
  Panchira has a special treatment for each website. `Resolver` classes are where those treatments take place, and you can use your own `Resolver` classes by registering them with Panchira. See `Panchira::Extensions` documentation in source code for further details.
48
48
 
49
+ ### About Twitter API
50
+
51
+ Due to a recent change at Twitter, it has become very hard to fetch tweet data by scraping. To solve this problem, Panchira can now use the official Twitter API.
52
+
53
+ To use the Twitter API instead of normal scraping, pass your Twitter bearer token in the options hash given to `Panchira::fetch`. If you don't set a token, Panchira will fall back to simple scraping.
54
+
55
+ ```
56
+ > Panchira.fetch("https://twitter.com/example/status/1234567890", { twitter: { bearer_token: 'ABC...123' } })
57
+ ```
58
+
49
59
  ## Development
50
60
 
51
61
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -6,8 +6,8 @@ module Panchira
6
6
  class KomifloResolver < Resolver
7
7
  URL_REGEXP = %r{komiflo\.com(?:/#!)?/comics/(\d+)}.freeze
8
8
 
9
- def initialize(url)
10
- @url = url
9
+ def initialize(url, options = nil)
10
+ super(url, options)
11
11
 
12
12
  @id = url.slice(URL_REGEXP, 1)
13
13
  raw_json = URI.parse("https://api.komiflo.com/content/id/#{@id}").read('User-Agent' => user_agent)
@@ -8,8 +8,8 @@ module Panchira
8
8
  URL_REGEXP = %r{novel18\.syosetu\.com/}.freeze
9
9
  ID_REGEXP = %{novel18\.syosetu\.com/(?<id>[^/]+)}
10
10
 
11
- def initialize(url)
12
- super(url)
11
+ def initialize(url, options = nil)
12
+ super(url, options)
13
13
 
14
14
  if id = @url.match(ID_REGEXP)[:id]
15
15
  @desc = fetch_page("https://novel18.syosetu.com/novelview/infotop/ncode/#{id}/")
@@ -48,8 +48,8 @@ module Panchira
48
48
  URL_REGEXP = /ncode\.syosetu\.com/.freeze
49
49
  ID_REGEXP = %{ncode\.syosetu\.com/(?<id>[^/]+)}
50
50
 
51
- def initialize(url)
52
- super(url)
51
+ def initialize(url, options = nil)
52
+ super(url, options)
53
53
 
54
54
  if id = @url.match(ID_REGEXP)[:id]
55
55
  @desc = fetch_page("https://novel18.syosetu.com/novelview/infotop/ncode/#{id}/")
@@ -4,8 +4,8 @@ module Panchira
4
4
  class PixivResolver < Resolver
5
5
  URL_REGEXP = %r{pixiv\.net/.*(member_illust.php?.*illust_id=|artworks/)(\d+)}.freeze
6
6
 
7
- def initialize(url)
8
- super(url)
7
+ def initialize(url, options = nil)
8
+ super(url, options)
9
9
  @illust_id = url.slice(URL_REGEXP, 2)
10
10
 
11
11
  raw_json = URI.parse("https://www.pixiv.net/ajax/illust/#{@illust_id}").read('User-Agent' => user_agent)
@@ -47,8 +47,8 @@ module Panchira
47
47
  class PixivNovelResolver < Resolver
48
48
  URL_REGEXP = %r{pixiv\.net/novel/show.php\?id=(\d+)}.freeze
49
49
 
50
- def initialize(url)
51
- super(url)
50
+ def initialize(url, options = nil)
51
+ super(url, options)
52
52
  @novel_id = url.slice(URL_REGEXP, 1)
53
53
 
54
54
  raw_json = URI.parse("https://www.pixiv.net/ajax/novel/#{@novel_id}").read('User-Agent' => user_agent)
@@ -11,8 +11,9 @@ module Panchira
11
11
  # You must override this in subclasses to limit which urls to resolve.
12
12
  URL_REGEXP = URI::DEFAULT_PARSER.make_regexp
13
13
 
14
- def initialize(url)
14
+ def initialize(url, options = nil)
15
15
  @url = url
16
+ @options = options
16
17
  end
17
18
 
18
19
  # This function is called right after this Resolver instance is made.
@@ -53,7 +54,8 @@ module Panchira
53
54
  def fetch_page(url)
54
55
  read_options = {
55
56
  'User-Agent' => user_agent,
56
- 'Cookie' => cookie
57
+ 'Cookie' => cookie,
58
+ :read_timeout => 10
57
59
  }
58
60
 
59
61
  raw_page = URI.parse(url).read(read_options)
@@ -1,22 +1,99 @@
1
+ require 'uri'
2
+
1
3
  module Panchira
2
4
  class TwitterResolver < Resolver
3
- URL_REGEXP = /twitter.com\/\w+\/status\/\d+/.freeze
5
+ URL_REGEXP = %r{twitter.com/(\w+)/status/(\d+)}.freeze
6
+
7
+ def initialize(url, options = nil)
8
+ super(url, options)
9
+ @screen_name = @url.slice(URL_REGEXP, 1)
10
+ @id = @url.slice(URL_REGEXP, 2)
11
+
12
+ @bearer_token = options&.dig(:twitter, :bearer_token)
13
+ end
14
+
15
+ def fetch
16
+ return super unless @bearer_token
17
+
18
+ @response = fetch_api if @bearer_token
19
+
20
+ result = PanchiraResult.new
21
+
22
+ result.canonical_url = parse_canonical_url
23
+ result.title = parse_title
24
+ result.description = parse_description
25
+ result.image = parse_image
26
+ result.tags = parse_tags
27
+ result.author = parse_author
28
+ result.resolver = parse_resolver
29
+
30
+ result
31
+ end
4
32
 
5
33
  private
34
+
35
+ def fetch_api
36
+ uri = URI.parse("https://api.twitter.com/2/tweets/#{@id}")
37
+ uri.query = URI.encode_www_form({
38
+ 'expansions': 'attachments.media_keys,author_id',
39
+ 'media.fields': 'preview_image_url,type,url',
40
+ 'user.fields': 'name,username',
41
+ 'tweet.fields': 'entities'
42
+ })
43
+
44
+ raw_json = uri.read('Authorization' => "Bearer #{@bearer_token}")
45
+ JSON.parse(raw_json)
46
+ end
47
+
48
+ def parse_canonical_url
49
+ # Twitter returns false canonical url when the account is set as sensitive.
50
+ "https://twitter.com/#{@screen_name}/status/#{@id}"
51
+ end
52
+
6
53
  def parse_title
7
- @title = super
54
+ @title = if @response
55
+ @author = @response['includes']['users'][0]['name']
56
+ "#{@author} on Twitter"
57
+ else
58
+ super
59
+ end
8
60
  end
9
61
 
10
62
  def parse_author
11
- @title.match(/\A(.+) on Twitter\z/)[1]
63
+ @author || @title.match(/\A(.+) on Twitter\z/)[1]
64
+ rescue StandardError
65
+ nil
12
66
  end
13
67
 
14
68
  def parse_description
15
- @description = super.gsub(/\A“|”\z/, '')
69
+ if @response
70
+ @response['data']['text']
71
+ else
72
+ @description = super.gsub(/\A“|”\z/, '')
73
+ end
16
74
  end
17
75
 
18
76
  def parse_tags
19
- @description.scan(/[##]([^##\s]+)/).map(&:first)
77
+ if @response
78
+ @response.dig('data', 'entities', 'hashtags')&.map { |obj| obj['tag'] }
79
+ else
80
+ @description.scan(/[##]([^##\s]+)/).map(&:first)
81
+ end
82
+ end
83
+
84
+ def parse_image_url
85
+ return super unless @response
86
+
87
+ first_media = @response.dig('includes', 'media')&.first
88
+
89
+ return unless first_media
90
+
91
+ case first_media['type']
92
+ when 'photo'
93
+ first_media['url']
94
+ when 'video'
95
+ first_media['preview_image_url']
96
+ end
20
97
  end
21
98
  end
22
99
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Panchira
4
- VERSION = '1.4.0'
4
+ VERSION = '1.5.0'
5
5
  end
data/lib/panchira.rb CHANGED
@@ -21,10 +21,10 @@ Dir.glob("#{project_root}/panchira/resolvers/*_resolver.rb").sort.each { |file|
21
21
  module Panchira
22
22
  class << self
23
23
  # Return a PanchiraResult that contains the attributes of given url.
24
- def fetch(url)
24
+ def fetch(url, options = nil)
25
25
  resolver = select_resolver(url)
26
26
 
27
- resolver.new(url).fetch
27
+ resolver.new(url, options).fetch
28
28
  end
29
29
 
30
30
  private
data/panchira.gemspec CHANGED
@@ -40,5 +40,5 @@ Gem::Specification.new do |spec|
40
40
  spec.add_development_dependency 'rubocop-minitest', '~> 0.10'
41
41
 
42
42
  spec.add_dependency 'fastimage', '~> 2.1.7'
43
- spec.add_dependency 'nokogiri', '>= 1.10.9', '< 1.13.0'
43
+ spec.add_dependency 'nokogiri', '>= 1.10.9', '< 1.14.0'
44
44
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: panchira
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - kyp
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-01-10 00:00:00.000000000 Z
11
+ date: 2022-03-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -103,7 +103,7 @@ dependencies:
103
103
  version: 1.10.9
104
104
  - - "<"
105
105
  - !ruby/object:Gem::Version
106
- version: 1.13.0
106
+ version: 1.14.0
107
107
  type: :runtime
108
108
  prerelease: false
109
109
  version_requirements: !ruby/object:Gem::Requirement
@@ -113,7 +113,7 @@ dependencies:
113
113
  version: 1.10.9
114
114
  - - "<"
115
115
  - !ruby/object:Gem::Version
116
- version: 1.13.0
116
+ version: 1.14.0
117
117
  description: |2
118
118
  Panchira allows you to parse attributes of hentais on some web platforms, such as Pixiv and DLSite.
119
119
  If you need card previews on hentai but can't get it with simply parsing metatags, then it is time for Panchira.
@@ -173,7 +173,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
173
173
  - !ruby/object:Gem::Version
174
174
  version: '0'
175
175
  requirements: []
176
- rubygems_version: 3.3.4
176
+ rubygems_version: 3.1.4
177
177
  signing_key:
178
178
  specification_version: 4
179
179
  summary: A parser for hentai websites