webg 0.1.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4):
  1. checksums.yaml +4 -4
  2. data/exe/webg +61 -14
  3. data/lib/webg/version.rb +1 -1
  4. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7bdc7060ad44a00a15487e6905f336ecfe2f144940e06253d0073c183236bffd
4
- data.tar.gz: 897236c858cc3fa6fee45d83677213318a6d11610646fac6214e1cf07e2af553
3
+ metadata.gz: 941940d059cf78116a42cdf6733bea9dd14a243e1fcde8773494a6eabb3c5e43
4
+ data.tar.gz: c2188442e6ab0f3cefea91d0a900259fcf254184b9fe38076a34899e4b924196
5
5
  SHA512:
6
- metadata.gz: babb391559cd6b4e67fd86ca7c172b4ed341c378f4a5a3b4d3b3ba569036c48fcafc65143b17990b472aafdca7d11fe680b18fa7825bd4a023c2392dbbcee8ca
7
- data.tar.gz: a99d118c3041fac293b5efcfb76643521c81a63cb827d489686abe49cb5b2cd068d1cbe54ad938663d6b208beaf95d5e59740bbce468b23e642c59e9d9193422
6
+ metadata.gz: 35f501258470789b85c1ea9543a611ae35d1e40ac35317fdf6fd0a139e1111ed7e7772ecba1f7fd7a7a7feef5eb27e602333cc9da1ade6ebc1bcaea051c357df
7
+ data.tar.gz: cfb7b398c2c399b79bac1d7ffa22b15f102da564a94c22a9fabd7bbe98af41926e3c9e5b6dc648feef2eb06faea778c99d22bf1c84b6d8082942616f5b8abc0e
data/exe/webg CHANGED
@@ -7,17 +7,36 @@ require "nokogiri"
7
7
 
8
8
  require "webg/version"
9
9
 
10
+ module Squeezable
11
+ def squeezed_text(document)
12
+ return text(document).each_line.map { |l|
13
+ l.gsub(/\p{Space}+/, " ").strip
14
+ }.join("\n").strip.gsub(/\n{3,}/, "\n\n")
15
+ end
16
+ end
17
+
10
18
  module Fetcher
11
19
  end
12
20
 
13
21
  class Fetcher::Raw
22
+ DEFAULT_USER_AGENT = "webg/#{Webg::VERSION}"
23
+
24
+ def initialize(headers)
25
+ @headers = headers
26
+ end
27
+
14
28
  def call(uri)
15
29
  require("open-uri")
16
- return uri.read
30
+ return uri.read({"User-Agent" => DEFAULT_USER_AGENT}.merge(@headers))
17
31
  end
18
32
  end
19
33
 
20
34
  class Fetcher::Firefox
35
+ def initialize(headers)
36
+ return if headers.empty?
37
+ raise "specifying headers on Firefox is not supported"
38
+ end
39
+
21
40
  def call(uri)
22
41
  require("capybara")
23
42
  session = Capybara::Session.new(:selenium_headless)
@@ -30,6 +49,8 @@ module Selector
30
49
  end
31
50
 
32
51
  class Selector::All
52
+ include Squeezable
53
+
33
54
  REJECT_TAG_NAMES = %w[script noscript style]
34
55
 
35
56
  def raw(document)
@@ -43,32 +64,33 @@ class Selector::All
43
64
  end
44
65
 
45
66
  class Selector::Css
67
+ include Squeezable
68
+
46
69
  def initialize(css_selectors)
47
70
  @css_selectors = css_selectors
48
71
  end
49
72
 
50
73
  def raw(document)
51
- return nodes(session).map { |node|
52
- node.evaluate_script('this.outerHTML')
53
- }.join("\n")
74
+ return process(document, :to_s)
54
75
  end
55
76
 
56
- def text(session)
57
- return nodes(session).map(&:text).join("\n")
77
+ def text(document)
78
+ return process(document, :text)
58
79
  end
59
80
 
60
81
  private
61
82
 
62
- def nodes(session)
63
- return session.all(@css_selectors.join(", "))
83
+ def process(document, method)
84
+ return document.css(@css_selectors.join(", ")).map(&method).join("\n")
64
85
  end
65
86
  end
66
87
 
67
88
  def parse_options(argv)
68
89
  argv = argv.dup
69
90
  fetcher = :raw
91
+ headers = {}
70
92
  css_selectors = []
71
- text = false
93
+ output_method_name = :raw
72
94
 
73
95
  parser = OptionParser.new
74
96
  parser.version = Webg::VERSION
@@ -81,6 +103,26 @@ def parse_options(argv)
81
103
  ) do
82
104
  fetcher = :firefox
83
105
  end
106
+ parser.on(
107
+ "--user-agent=USER-AGENT",
108
+ "specify User-Agent header",
109
+ ) do |ua|
110
+ headers["User-Agent"] = ua
111
+ end
112
+ parser.on(
113
+ "--referer=REFERER",
114
+ "specify Referer header",
115
+ ) do |referer|
116
+ headers["Referer"] = referer
117
+ end
118
+ parser.on(
119
+ "--header=HEADER-LINE",
120
+ "specify various headers in HTTP request. e.g: --header='Accept-Language: ja'"
121
+ ) do |header_line|
122
+ md = /:\s+/.match(header_line)
123
+ raise "cannot parse header-line(#{header_line})" if !md
124
+ headers[md.pre_match] = md.post_match
125
+ end
84
126
  parser.on(
85
127
  "--css-selector=SELECTOR",
86
128
  "specify css selector to filter output.",
@@ -91,7 +133,13 @@ def parse_options(argv)
91
133
  "--text",
92
134
  "output only text",
93
135
  ) do
94
- text = true
136
+ output_method_name = :text
137
+ end
138
+ parser.on(
139
+ "--squeezed-text",
140
+ "output text with squeezing white spaces",
141
+ ) do
142
+ output_method_name = :squeezed_text
95
143
  end
96
144
  parser.parse!(argv)
97
145
 
@@ -102,15 +150,14 @@ def parse_options(argv)
102
150
  end
103
151
  uri = URI(uri)
104
152
 
105
- return uri, fetcher, css_selectors, text
153
+ return uri, fetcher, headers, css_selectors, output_method_name
106
154
  end
107
155
 
108
156
  begin
109
- uri, fetcher_name, css_selectors, text = parse_options(ARGV)
157
+ uri, fetcher_name, headers, css_selectors, output_method_name = parse_options(ARGV)
110
158
 
111
- fetcher = Fetcher.const_get(fetcher_name.capitalize).new
159
+ fetcher = Fetcher.const_get(fetcher_name.capitalize).new(headers)
112
160
  selector = css_selectors.empty? ? Selector::All.new : Selector::Css.new(css_selectors)
113
- output_method_name = text ? :text : :raw
114
161
 
115
162
  document = Nokogiri::HTML.parse(fetcher.(uri))
116
163
  puts(selector.public_send(output_method_name, document))
data/lib/webg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Webg
4
- VERSION = "0.1.0"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yuya.Nishida.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-02 00:00:00.000000000 Z
11
+ date: 2022-07-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -77,7 +77,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
77
77
  - !ruby/object:Gem::Version
78
78
  version: '0'
79
79
  requirements: []
80
- rubygems_version: 3.2.15
80
+ rubygems_version: 3.3.7
81
81
  signing_key:
82
82
  specification_version: 4
83
83
  summary: 'webg: A downloader to get web page with JavaScript'