chupa-text-decomposer-webkit 1.0.2 → 1.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 9cfa2e299e210eed4fcdafa1f85934d377eb56f1
4
- data.tar.gz: a4658b1c5b840ad94e6185ba1d41e4ba703e83c4
2
+ SHA256:
3
+ metadata.gz: 1e38649e31b6142b1b907ecdc15fc109d66293c3408b55dca5fa5596d77a38a0
4
+ data.tar.gz: 4a85f2e3a9a21c280469c53bfb1f54f70ca1c80b7d35eb2093d524f6ab64a0f3
5
5
  SHA512:
6
- metadata.gz: 5ff6407a451e09c91b670b1ee85361bffa7c3ed166d15ef7cd88d1cbc16af1ee0381f7c011d116346090325f6d0237703b331f9d692ca25d5de04d0ed9c283e1
7
- data.tar.gz: 122344f3cfcda27ec94131a933aed1836289c8565c439d09b6e5b37962718db211dece223af7c565fefd4d94fe50627493bbf26ad2b5288ca55224b92353a0c4
6
+ metadata.gz: 20637409e930276464defaddf3a1d7018bc67b3c524509322c4b599dda0f217a7c0a8a2053d105520c47e9a1bd8e949870dbc381d598b12103dc1b05bdb46fdc
7
+ data.tar.gz: 9f03f9ed5efb9d242d4770a884afe7f85b1547e72f6ebb1703cae45bc4fc544c5b00f0126cc5558aaaf8023ede3e829b86773e2e6c0b73ae3149d633cf484f4d
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Copyright (C) 2017-2021 Sutou Kouhei <kou@clear-code.com>
4
+ #
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ #
10
+ # This library is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ # Lesser General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU Lesser General Public
16
+ # License along with this library; if not, write to the Free Software
17
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
+ require "chupa-text-decomposer-webkit/screenshoter"
20
+
21
# Minimal logger for the screenshoter process. Every message is written
# to standard output as a single "LEVEL: message" line and flushed
# immediately. The message text is the value returned by the block given
# to #debug / #error.
logger = Object.new
class << logger
  # Writes "debug: " followed by the block's value.
  def debug
    write_line("debug: #{yield}")
  end

  # Writes "error: " followed by the block's value.
  def error
    write_line("error: #{yield}")
  end

  private

  # Emits one line on $stdout and flushes it right away.
  def write_line(line)
    $stdout.puts(line)
    $stdout.flush
  end
end
31
+
32
# Command line arguments, in order: path of the HTML file, URI passed to
# the screenshoter along with the HTML, path of the PNG to write, and
# the screenshot width and height.
path, uri, output_path, width, height = ARGV
screenshoter = ChupaTextDecomposerWebKit::Screenshoter.new(logger)
# The HTML is always read as UTF-8.
html = File.read(path, encoding: "UTF-8")
screenshoter.run(html, uri, output_path, Integer(width), Integer(height))
@@ -22,7 +22,7 @@ end
22
22
 
23
23
  Gem::Specification.new do |spec|
24
24
  spec.name = "chupa-text-decomposer-webkit"
25
- spec.version = "1.0.2"
25
+ spec.version = "1.0.7"
26
26
  spec.homepage = "https://github.com/ranguba/chupa-text-decomposer-webkit"
27
27
  spec.authors = ["Kouhei Sutou"]
28
28
  spec.email = ["kou@clear-code.com"]
@@ -38,6 +38,9 @@ Gem::Specification.new do |spec|
38
38
  spec.files += Dir.glob("lib/**/*.rb")
39
39
  spec.files += Dir.glob("doc/text/*")
40
40
  spec.files += Dir.glob("test/**/*")
41
+ Dir.chdir("bin") do
42
+ spec.executables = Dir.glob("*")
43
+ end
41
44
 
42
45
  spec.add_runtime_dependency("chupa-text", ">= 1.0.7")
43
46
  spec.add_runtime_dependency("webkit2-gtk", ">= 3.1.7")
data/doc/text/news.md CHANGED
@@ -1,10 +1,42 @@
1
1
  # News
2
2
 
3
+ ## 1.0.7: 2021-03-04
4
+
5
+ ### Improvements
6
+
7
+ * screenshoter: Stopped depending on locale.
8
+
9
+ ## 1.0.6: 2017-07-12
10
+
11
+ ### Fixes
12
+
13
+ * Added missing executable files.
14
+
15
+ ## 1.0.5: 2017-07-12
16
+
17
+ ### Improvements
18
+
19
+ * Added workaround for crash.
20
+
21
+ ## 1.0.4: 2017-07-12
22
+
23
+ ### Improvements
24
+
25
+ * Added more logs.
26
+
27
+ * Supported proxy.
28
+
29
+ ## 1.0.3: 2017-07-11
30
+
31
+ ### Improvements
32
+
33
+ * Supported timeout.
34
+
3
35
  ## 1.0.2: 2017-07-11
4
36
 
5
37
  ### Improvements
6
38
 
7
- * Reduces required resources.
39
+ * Reduced required resources.
8
40
 
9
41
  ## 1.0.1: 2017-07-11
10
42
 
@@ -0,0 +1,216 @@
1
+ # Copyright (C) 2017 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # This library is free software; you can redistribute it and/or
4
+ # modify it under the terms of the GNU Lesser General Public
5
+ # License as published by the Free Software Foundation; either
6
+ # version 2.1 of the License, or (at your option) any later version.
7
+ #
8
+ # This library is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
+ # Lesser General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+
17
+ require "webkit2-gtk"
18
+
19
module ChupaTextDecomposerWebKit
  # Loads HTML into a WebKit view hosted in an offscreen GTK window,
  # takes a snapshot of the full document once loading finishes, and
  # writes a scaled PNG of it.
  class Screenshoter
    # logger: object whose #debug and #error are called with a block
    # that returns the message to log.
    def initialize(logger)
      @logger = logger
      # Order matters: the view needs the context, the window needs the view.
      @view_context = create_view_context
      @view = create_view
      @window = create_window
      @main_loop = GLib::MainLoop.new(nil, false)
      @timeout_second = compute_timeout_second
      @screenshot_cancellable = nil
      @on_snapshot = nil
    end

    # Loads +body+ with +uri+ passed alongside it to the view, runs the
    # main loop until a snapshot, a load failure or the timeout stops it,
    # and writes a +width+ x +height+ PNG to +output_path+ when a
    # non-empty snapshot was taken.
    def run(body, uri, output_path, width, height)
      @on_snapshot = lambda do |snapshot_surface|
        scale_snapshot(snapshot_surface, width, height).write_to_png(output_path)
      end

      timeout do
        debug { "#{log_tag}[load][HTML] #{uri}" }
        @view.load_html(body, uri)
        @main_loop.run
      end
    ensure
      # Never keep the per-run callback around after the run.
      @on_snapshot = nil
    end

    private

    # Builds an ephemeral web context. Proxies are taken from the
    # http_proxy, https_proxy and ftp_proxy environment variables when
    # at least one of them is set.
    def create_view_context
      context = WebKit2Gtk::WebContext.new(ephemeral: true)
      proxies = {
        "http" => ENV["http_proxy"],
        "https" => ENV["https_proxy"],
        "ftp" => ENV["ftp_proxy"],
      }.reject { |_, proxy| proxy.nil? }
      unless proxies.empty?
        proxy_settings = WebKit2Gtk::NetworkProxySettings.new
        proxies.each do |scheme, proxy|
          proxy_settings.add_proxy_for_scheme(scheme, proxy)
        end
        context.set_network_proxy_settings(:custom, proxy_settings)
      end
      context
    end

    # Creates the web view and wires the load signals: a finished load
    # starts a snapshot, a failed load stops the main loop.
    def create_view
      view = WebKit2Gtk::WebView.new(context: @view_context)

      view.signal_connect("load-changed") do |_, load_event|
        debug { "#{log_tag}[load][#{load_event.nick}] #{view.uri}" }

        case load_event
        when WebKit2Gtk::LoadEvent::FINISHED
          start_snapshot(view)
        end
      end

      # The block parameter is deliberately not named "error" so that the
      # call to the #error logging helper below is unambiguous.
      view.signal_connect("load-failed") do |_, _, failed_uri, load_error|
        cancel_screenshot
        @main_loop.quit
        error do
          details = "#{load_error.class}(#{load_error.code}): #{load_error.message}"
          "#{log_tag}[load][failed] failed to load URI: #{failed_uri}: #{details}"
        end
        true
      end

      view
    end

    # Requests a full-document snapshot of +view+, replacing any
    # snapshot request that is still pending.
    def start_snapshot(view)
      debug { "#{log_tag}[screenshot][start] #{view.uri}" }
      cancel_screenshot
      @screenshot_cancellable = Gio::Cancellable.new
      view.get_snapshot(:full_document,
                        :none,
                        @screenshot_cancellable) do |_, result|
        @screenshot_cancellable = nil
        # The main loop is stopped before the result is processed.
        @main_loop.quit
        finish_snapshot(view, result)
      end
    end

    # Completes a snapshot request: logs a failure, or hands a non-empty
    # surface to the callback installed by #run.
    def finish_snapshot(view, result)
      snapshot_surface = view.get_snapshot_finish(result)
    rescue => exception
      error do
        details = "#{exception.class}: #{exception.message}"
        "#{log_tag}[screenshot][failed] failed to create snapshot: #{view.uri}: #{details}"
      end
    else
      debug do
        size = "#{snapshot_surface.width}x#{snapshot_surface.height}"
        "#{log_tag}[screenshot][finish] #{view.uri}: #{size}"
      end
      unless snapshot_surface.width.zero?
        @on_snapshot.call(snapshot_surface) if @on_snapshot
      end
    end

    # Paints +snapshot_surface+ onto a white +width+ x +height+ surface,
    # scaled uniformly by the ratio of +width+ to the snapshot's width.
    def scale_snapshot(snapshot_surface, width, height)
      Cairo::ImageSurface.new(:argb32, width, height).tap do |scaled_surface|
        context = Cairo::Context.new(scaled_surface)
        context.set_source_color(:white)
        context.paint

        ratio = width.to_f / snapshot_surface.width
        context.scale(ratio, ratio)
        context.set_source(snapshot_surface)
        context.paint
      end
    end

    # Hosts the view in an offscreen window with a default size of 800x600.
    def create_window
      Gtk::OffscreenWindow.new.tap do |window|
        window.set_default_size(800, 600)
        window.add(@view)
        window.show_all
      end
    end

    # Cancels the pending snapshot request, if there is one.
    def cancel_screenshot
      return if @screenshot_cancellable.nil?

      debug { "#{log_tag}[snapshot][cancel] cancel screenshot: #{@view.uri}" }
      @screenshot_cancellable.cancel
      @screenshot_cancellable = nil
    end

    # Runs the given block with a timer of @timeout_second seconds
    # armed. When the timer fires, the pending snapshot is cancelled and
    # the main loop is stopped, either immediately or once a
    # still-loading view has been closed. The timer is removed on exit
    # unless it has already fired.
    def timeout
      timeout_id = GLib::Timeout.add_seconds(@timeout_second) do
        # The source is removed by returning REMOVE below, so it must
        # not be removed a second time in the ensure clause.
        timeout_id = nil
        error do
          message = "timeout to load URI: #{@timeout_second}s: #{@view.uri}"
          message << ": loading" if @view.loading?
          "#{log_tag}[load][timeout] #{message}"
        end
        cancel_screenshot
        if @view.loading?
          close_view_then_quit
        else
          @main_loop.quit
        end
        GLib::Source::REMOVE
      end

      yield
    ensure
      GLib::Source.remove(timeout_id) if timeout_id
    end

    # Asks the view to close and stops the main loop from its "close"
    # signal handler, which disconnects itself when it runs.
    def close_view_then_quit
      close_id = @view.signal_connect("close") do
        @view.signal_handler_disconnect(close_id)
        @main_loop.quit
        error { "#{log_tag}[load][closed] #{@view.uri}" }
      end
      @view.try_close
    end

    # Timeout in seconds: the integer value of the
    # CHUPA_TEXT_DECOMPOSER_WEBKIT_TIMEOUT environment variable, or 5
    # when it is unset or not a valid integer.
    def compute_timeout_second
      default_timeout = 5
      configured = ENV["CHUPA_TEXT_DECOMPOSER_WEBKIT_TIMEOUT"]
      return default_timeout if configured.nil?

      Integer(configured)
    rescue ArgumentError
      default_timeout
    end

    # Prefix put in front of every log message of this class.
    def log_tag
      "[decomposer][webkit]"
    end

    # Forwards to the logger given to #initialize.
    def debug(*args, &block)
      @logger.debug(*args, &block)
    end

    # Forwards to the logger given to #initialize.
    def error(*args, &block)
      @logger.error(*args, &block)
    end
  end
end
@@ -14,12 +14,21 @@
14
14
  # License along with this library; if not, write to the Free Software
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
- require "webkit2-gtk"
17
+ require "English"
18
+ require "rbconfig"
18
19
 
19
20
  module ChupaText
20
21
  module Decomposers
21
22
  class WebKit < Decomposer
23
# Mixin that provides the private #log_tag prefix used in log messages.
module LogTag
  # Tag string placed in front of log messages.
  def log_tag
    "[decomposer][webkit]"
  end
  private :log_tag
end
29
+
22
30
  include Loggable
31
+ include LogTag
23
32
 
24
33
  registry.register("webkit", self)
25
34
 
@@ -49,88 +58,92 @@ module ChupaText
49
58
  false
50
59
  end
51
60
 
61
+ IN_PROCESS = ENV["CHUPA_TEXT_DECOMPOSER_WEBKIT_IN_PROCESS"] == "yes"
62
+ if IN_PROCESS
63
+ require "chupa-text-decomposer-webkit/screenshoter"
64
+ end
65
+
52
66
  def decompose(data)
53
- data.screenshot = create_screenshot(data.source)
67
+ body = data.source.body
68
+ uri = data.source.uri.to_s
69
+ output = Tempfile.new(["chupa-text-decomposer-webkit", ".png"])
70
+ width, height = data.expected_screenshot_size
71
+ if IN_PROCESS
72
+ screenshoter = ChupaTextDecomposerWebKit::Screenshoter.new(logger)
73
+ screenshoter.run(body, uri, output.path, width, height)
74
+ else
75
+ screenshoter = ExternalScreenshoter.new
76
+ screenshoter.run(data.source.path, uri, output.path, width, height)
77
+ end
78
+ unless File.size(output.path).zero?
79
+ png = output.read
80
+ data.screenshot = Screenshot.new("image/png",
81
+ [png].pack("m*"),
82
+ "base64")
83
+ end
54
84
  data[AVAILABLE_ATTRIBUTE_NAME] = !data.screenshot.nil?
55
85
  yield(data)
56
86
  end
57
87
 
58
- private
59
- def create_screenshot(data)
60
- screenshot = nil
61
-
62
- @@view_context ||= WebKit2Gtk::WebContext.new(ephemeral: true)
63
- view = WebKit2Gtk::WebView.new(context: @@view_context)
64
- window = Gtk::OffscreenWindow.new
65
- window.set_default_size(800, 600)
66
- window.add(view)
67
- window.show_all
68
-
69
- finished = false
70
- view.signal_connect("load-changed") do |_, load_event|
71
- debug do
72
- "#{log_tag}[load][#{load_event.nick}] #{view.uri}"
73
- end
88
+ class ExternalScreenshoter
89
+ include Loggable
90
+ include LogTag
91
+
92
+ def initialize
93
+ @screenshoter = File.join(__dir__,
94
+ "..",
95
+ "..",
96
+ "..",
97
+ "bin",
98
+ "chupa-text-decomposer-webkit-screenshoter")
99
+ @command = ExternalCommand.new(RbConfig.ruby)
100
+ end
74
101
 
75
- case load_event
76
- when WebKit2Gtk::LoadEvent::FINISHED
77
- view.get_snapshot(:full_document, :none) do |_, result|
78
- finished = true
79
- snapshot_surface = view.get_snapshot_finish(result)
80
- unless snapshot_surface.width.zero?
81
- png = convert_snapshot_surface_to_png(data, snapshot_surface)
82
- screenshot = Screenshot.new("image/png",
83
- [png].pack("m*"),
84
- "base64")
102
+ def run(html_path, uri, output_path, width, height)
103
+ output_read, output_write = IO.pipe
104
+ error_output = Tempfile.new("chupa-text-decomposer-webkit-error")
105
+ output_reader = Thread.new do
106
+ loop do
107
+ IO.select([output_read])
108
+ line = output_read.gets
109
+ break if line.nil?
110
+
111
+ case line.chomp
112
+ when /\Adebug: /
113
+ debug($POSTMATCH)
114
+ when /\Aerror: /
115
+ error($POSTMATCH)
85
116
  end
86
117
  end
87
118
  end
88
- end
89
- view.signal_connect("load-failed") do |_, _, failed_uri, error|
90
- finished = true
91
- error do
92
- message = "failed to load URI: #{failed_uri}: "
93
- message << "#{error.class}(#{error.code}): #{error.message}"
94
- "#{log_tag}[load][failed] #{message}"
119
+ successed = @command.run(@screenshoter,
120
+ html_path,
121
+ uri,
122
+ output_path,
123
+ width.to_s,
124
+ height.to_s,
125
+ {
126
+ :spawn_options => {
127
+ :out => output_write,
128
+ :err => error_output.path,
129
+ },
130
+ })
131
+ output_write.close
132
+ output_reader.join
133
+
134
+ unless successed
135
+ error do
136
+ message = "failed to external screenshoter: #{uri}: "
137
+ message << "#{@command.path} #{@screenshoter}"
138
+ "#{log_tag}[external-screenshoter][run][failed] #{message}"
139
+ end
140
+ end
141
+ unless error_output.size.zero?
142
+ error_output.each_line do |line|
143
+ error(line)
144
+ end
95
145
  end
96
- true
97
- end
98
- debug do
99
- "#{log_tag}[load][html] #{data.uri}"
100
- end
101
- view.load_html(data.body, data.uri.to_s)
102
-
103
- main_context = GLib::MainContext.default
104
- until finished
105
- main_context.iteration(true)
106
146
  end
107
- window.destroy
108
-
109
- screenshot
110
- end
111
-
112
- def convert_snapshot_surface_to_png(data, snapshot_surface)
113
- screenshot_width, screenshot_height = data.expected_screenshot_size
114
-
115
- screenshot_surface = Cairo::ImageSurface.new(:argb32,
116
- screenshot_width,
117
- screenshot_height)
118
- context = Cairo::Context.new(screenshot_surface)
119
- context.set_source_color(:white)
120
- context.paint
121
-
122
- ratio = screenshot_width.to_f / snapshot_surface.width
123
- context.scale(ratio, ratio)
124
- context.set_source(snapshot_surface)
125
- context.paint
126
-
127
- png = StringIO.new
128
- screenshot_surface.write_to_png(png)
129
- png.string
130
- end
131
-
132
- def log_tag
133
- "[decomposer][webkit]"
134
147
  end
135
148
  end
136
149
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text-decomposer-webkit
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kouhei Sutou
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-11 00:00:00.000000000 Z
11
+ date: 2021-03-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: chupa-text
@@ -115,7 +115,8 @@ description: |
115
115
  You can use `webkit` decomposer.
116
116
  email:
117
117
  - kou@clear-code.com
118
- executables: []
118
+ executables:
119
+ - chupa-text-decomposer-webkit-screenshoter
119
120
  extensions: []
120
121
  extra_rdoc_files: []
121
122
  files:
@@ -124,8 +125,10 @@ files:
124
125
  - LICENSE.txt
125
126
  - README.md
126
127
  - Rakefile
128
+ - bin/chupa-text-decomposer-webkit-screenshoter
127
129
  - chupa-text-decomposer-webkit.gemspec
128
130
  - doc/text/news.md
131
+ - lib/chupa-text-decomposer-webkit/screenshoter.rb
129
132
  - lib/chupa-text/decomposers/webkit.rb
130
133
  - test/run-test.rb
131
134
  - test/test-webkit.rb
@@ -148,8 +151,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
148
151
  - !ruby/object:Gem::Version
149
152
  version: '0'
150
153
  requirements: []
151
- rubyforge_project:
152
- rubygems_version: 2.5.2
154
+ rubygems_version: 3.3.0.dev
153
155
  signing_key:
154
156
  specification_version: 4
155
157
  summary: This is a ChupaText decomposer plugin for to extract text and meta-data from