webg 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/exe/webg +61 -14
  3. data/lib/webg/version.rb +1 -1
  4. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7bdc7060ad44a00a15487e6905f336ecfe2f144940e06253d0073c183236bffd
4
- data.tar.gz: 897236c858cc3fa6fee45d83677213318a6d11610646fac6214e1cf07e2af553
3
+ metadata.gz: 941940d059cf78116a42cdf6733bea9dd14a243e1fcde8773494a6eabb3c5e43
4
+ data.tar.gz: c2188442e6ab0f3cefea91d0a900259fcf254184b9fe38076a34899e4b924196
5
5
  SHA512:
6
- metadata.gz: babb391559cd6b4e67fd86ca7c172b4ed341c378f4a5a3b4d3b3ba569036c48fcafc65143b17990b472aafdca7d11fe680b18fa7825bd4a023c2392dbbcee8ca
7
- data.tar.gz: a99d118c3041fac293b5efcfb76643521c81a63cb827d489686abe49cb5b2cd068d1cbe54ad938663d6b208beaf95d5e59740bbce468b23e642c59e9d9193422
6
+ metadata.gz: 35f501258470789b85c1ea9543a611ae35d1e40ac35317fdf6fd0a139e1111ed7e7772ecba1f7fd7a7a7feef5eb27e602333cc9da1ade6ebc1bcaea051c357df
7
+ data.tar.gz: cfb7b398c2c399b79bac1d7ffa22b15f102da564a94c22a9fabd7bbe98af41926e3c9e5b6dc648feef2eb06faea778c99d22bf1c84b6d8082942616f5b8abc0e
data/exe/webg CHANGED
@@ -7,17 +7,36 @@ require "nokogiri"
7
7
 
8
8
  require "webg/version"
9
9
 
10
+ module Squeezable
11
+ def squeezed_text(document)
12
+ return text(document).each_line.map { |l|
13
+ l.gsub(/\p{Space}+/, " ").strip
14
+ }.join("\n").strip.gsub(/\n{3,}/, "\n\n")
15
+ end
16
+ end
17
+
10
18
  module Fetcher
11
19
  end
12
20
 
13
21
  class Fetcher::Raw
22
+ DEFAULT_USER_AGENT = "webg/#{Webg::VERSION}"
23
+
24
+ def initialize(headers)
25
+ @headers = headers
26
+ end
27
+
14
28
  def call(uri)
15
29
  require("open-uri")
16
- return uri.read
30
+ return uri.read({"User-Agent" => DEFAULT_USER_AGENT}.merge(@headers))
17
31
  end
18
32
  end
19
33
 
20
34
  class Fetcher::Firefox
35
+ def initialize(headers)
36
+ return if headers.empty?
37
+ raise "specifying headers on Firefox is not supported"
38
+ end
39
+
21
40
  def call(uri)
22
41
  require("capybara")
23
42
  session = Capybara::Session.new(:selenium_headless)
@@ -30,6 +49,8 @@ module Selector
30
49
  end
31
50
 
32
51
  class Selector::All
52
+ include Squeezable
53
+
33
54
  REJECT_TAG_NAMES = %w[script noscript style]
34
55
 
35
56
  def raw(document)
@@ -43,32 +64,33 @@ class Selector::All
43
64
  end
44
65
 
45
66
  class Selector::Css
67
+ include Squeezable
68
+
46
69
  def initialize(css_selectors)
47
70
  @css_selectors = css_selectors
48
71
  end
49
72
 
50
73
  def raw(document)
51
- return nodes(session).map { |node|
52
- node.evaluate_script('this.outerHTML')
53
- }.join("\n")
74
+ return process(document, :to_s)
54
75
  end
55
76
 
56
- def text(session)
57
- return nodes(session).map(&:text).join("\n")
77
+ def text(document)
78
+ return process(document, :text)
58
79
  end
59
80
 
60
81
  private
61
82
 
62
- def nodes(session)
63
- return session.all(@css_selectors.join(", "))
83
+ def process(document, method)
84
+ return document.css(@css_selectors.join(", ")).map(&method).join("\n")
64
85
  end
65
86
  end
66
87
 
67
88
  def parse_options(argv)
68
89
  argv = argv.dup
69
90
  fetcher = :raw
91
+ headers = {}
70
92
  css_selectors = []
71
- text = false
93
+ output_method_name = :raw
72
94
 
73
95
  parser = OptionParser.new
74
96
  parser.version = Webg::VERSION
@@ -81,6 +103,26 @@ def parse_options(argv)
81
103
  ) do
82
104
  fetcher = :firefox
83
105
  end
106
+ parser.on(
107
+ "--user-agent=USER-AGENT",
108
+ "specify User-Agent header",
109
+ ) do |ua|
110
+ headers["User-Agent"] = ua
111
+ end
112
+ parser.on(
113
+ "--referer=REFERER",
114
+ "specify Referer header",
115
+ ) do |referer|
116
+ headers["Referer"] = referer
117
+ end
118
+ parser.on(
119
+ "--header=HEADER-LINE",
120
+ "specify various headers in HTTP request. e.g: --header='Accept-Language: ja'"
121
+ ) do |header_line|
122
+ md = /:\s+/.match(header_line)
123
+ raise "cannot parse header-line(#{header_line})" if !md
124
+ headers[md.pre_match] = md.post_match
125
+ end
84
126
  parser.on(
85
127
  "--css-selector=SELECTOR",
86
128
  "specify css selector to filter output.",
@@ -91,7 +133,13 @@ def parse_options(argv)
91
133
  "--text",
92
134
  "output only text",
93
135
  ) do
94
- text = true
136
+ output_method_name = :text
137
+ end
138
+ parser.on(
139
+ "--squeezed-text",
140
+ "output text with squeezing white spaces",
141
+ ) do
142
+ output_method_name = :squeezed_text
95
143
  end
96
144
  parser.parse!(argv)
97
145
 
@@ -102,15 +150,14 @@ def parse_options(argv)
102
150
  end
103
151
  uri = URI(uri)
104
152
 
105
- return uri, fetcher, css_selectors, text
153
+ return uri, fetcher, headers, css_selectors, output_method_name
106
154
  end
107
155
 
108
156
  begin
109
- uri, fetcher_name, css_selectors, text = parse_options(ARGV)
157
+ uri, fetcher_name, headers, css_selectors, output_method_name = parse_options(ARGV)
110
158
 
111
- fetcher = Fetcher.const_get(fetcher_name.capitalize).new
159
+ fetcher = Fetcher.const_get(fetcher_name.capitalize).new(headers)
112
160
  selector = css_selectors.empty? ? Selector::All.new : Selector::Css.new(css_selectors)
113
- output_method_name = text ? :text : :raw
114
161
 
115
162
  document = Nokogiri::HTML.parse(fetcher.(uri))
116
163
  puts(selector.public_send(output_method_name, document))
data/lib/webg/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Webg
4
- VERSION = "0.1.0"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yuya.Nishida.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-05-02 00:00:00.000000000 Z
11
+ date: 2022-07-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -77,7 +77,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
77
77
  - !ruby/object:Gem::Version
78
78
  version: '0'
79
79
  requirements: []
80
- rubygems_version: 3.2.15
80
+ rubygems_version: 3.3.7
81
81
  signing_key:
82
82
  specification_version: 4
83
83
  summary: 'webg: A downloader to get web page with JavaScript'