aranha 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0e0391fd8968a27fb7422b6311396ef730623740c0fa5f8d3bab0a520f3278fc
4
- data.tar.gz: 0dae6f9c28a19145630482bc81ad131ac24a8285d1ff41276c7e900ff1769e01
3
+ metadata.gz: d65a2ccecd09ab619dea2d76306dfc48c7aad83b0b111b4544afb8492a97ef04
4
+ data.tar.gz: fd1d57c4d7ec4a22f8bd5829b8ad9a76838b729d54440b68c559cdf8ba5f9482
5
5
  SHA512:
6
- metadata.gz: 422311a8f9fc05d042eccdbe5c91742ab2e8802a2aad57ed45cbce9ac39ebee75d0dc1ba179e5b4499d54b51a41f7d85717defa8f906c8df3e6a1468c995ddca
7
- data.tar.gz: 41eaaad6cd8d1ad9fe68b926b66bbbde692befc4bd465a0ff794f74f9454ae1851961af6525edd60f911bfa560703956b8d70739ac88b962ea6485b49aa2e050
6
+ metadata.gz: 3b34ee01cfc1f6be364017da680c64b4abf7a5bc303b40edfdedf7532cdfd69cbe17d57349fac4fbc8881dc8db336f8a5738aa1976e9e5a3f60a09b8c4808f77
7
+ data.tar.gz: c5129c06f6ec81bfd02da79ccfa64b1187abf644e2271ac84ea30d24fc02b1608c13b74fbf3f962ecbe4a3d69955c08c51fd0ad1f692395e82ac15a78978f638
@@ -16,3 +16,4 @@ require_dependency 'aranha/parsers/html/base'
16
16
  require_dependency 'aranha/parsers/html/item_list'
17
17
  require_dependency 'aranha/parsers/invalid_state_exception'
18
18
  require_dependency 'aranha/dom_elements_traverser'
19
+ require_dependency 'aranha/selenium/driver_factory'
@@ -6,6 +6,8 @@ require 'fileutils'
6
6
  module Aranha
7
7
  module Parsers
8
8
  class Base
9
+ LOG_DIR_ENVVAR = 'ARANHA_PARSERS_LOG_DIR'
10
+
9
11
  def initialize(url)
10
12
  @url = url
11
13
  end
@@ -67,14 +69,32 @@ module Aranha
67
69
  end
68
70
 
69
71
  def log_content(content)
70
- File.open(log_file, 'wb') { |file| file.write(content) }
72
+ path = log_file
73
+ return unless path
74
+ File.open(path, 'wb') { |file| file.write(content) }
71
75
  end
72
76
 
73
77
  def log_file
74
- f = Rails.root.join('log', 'parsers', "#{self.class.name.parameterize}.log")
78
+ dir = log_parsers_dir
79
+ return nil unless dir
80
+ f = ::File.join(dir, "#{self.class.name.parameterize}.log")
75
81
  FileUtils.mkdir_p(File.dirname(f))
76
82
  f
77
83
  end
84
+
85
# Chooses the directory in which parser logs are stored.
#
# Lookup order:
# 1. the environment variable named by LOG_DIR_ENVVAR, unless it is unset,
#    empty or only whitespace;
# 2. "log/parsers" joined onto ::Rails.root, when #rails_root_exist? is true.
#
# @return [Object, nil] the environment value (a String), the result of
#   ::Rails.root.join, or nil when neither source is available
def log_parsers_dir
  from_env = ENV[LOG_DIR_ENVVAR]
  # A blank value must not win: File.join('', 'x.log') would point at the
  # filesystem root.
  return from_env unless from_env.nil? || from_env.strip.empty?
  return ::Rails.root.join('log', 'parsers') if rails_root_exist?

  nil
end
90
+
91
# Tells whether a usable ::Rails.root is available.
#
# @return [Boolean] true only when the ::Rails constant is defined, responds
#   to +root+ and that root is not nil
def rails_root_exist?
  # Rails is a module, not a class, so the constant's kind is not checked;
  # only the capability that is actually needed.
  return false unless defined?(::Rails)
  return false unless ::Rails.respond_to?(:root)

  # A nil root cannot be joined with path segments, so it counts as absent.
  !::Rails.root.nil?
end
78
98
  end
79
99
  end
80
100
  end
@@ -1,12 +1,26 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_dependency 'aranha/parsers/base'
4
- require_dependency 'aranha/parsers/html/node/default'
3
+ require_relative '../base'
4
+ require_relative 'node/default'
5
5
 
6
6
  module Aranha
7
7
  module Parsers
8
8
  module Html
9
9
  class Base < ::Aranha::Parsers::Base
10
class << self
  # One declared field: its name, its type and the XPath used to read it.
  Field = Struct.new(:name, :type, :xpath)

  # Fields declared so far on this class.
  #
  # @return [Array<Field>] a copy, so callers cannot alter the declarations
  def fields
    (@fields ||= []).dup
  end

  # Declares a field for this parser class.
  #
  # @param name the field name
  # @param type the field type
  # @param xpath the XPath expression associated with the field
  # @return [Array<Field>] the internal list of declarations
  def field(name, type, xpath)
    (@fields ||= []) << Field.new(name, type, xpath)
  end
end
23
+
10
24
  def nokogiri
11
25
  @nokogiri ||= Nokogiri::HTML(content, &:noblanks)
12
26
  end
@@ -22,6 +36,10 @@ module Aranha
22
36
  def node_parser
23
37
  @node_parser ||= node_parser_class.new(fields)
24
38
  end
39
+
40
# Field declarations of this parser's class, flattened to plain arrays.
#
# @return [Array<Array>] one [name, type, xpath] triple per declared field
def fields
  self.class.fields.map do |declared|
    [declared.name, declared.type, declared.xpath]
  end
end
25
43
  end
26
44
  end
27
45
  end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'base'
4
+
5
module Aranha
  module Parsers
    module Html
      # HTML parser that extracts the data of a single item.
      #
      # NOTE(review): +item_xpath+ is called but not defined in this class;
      # presumably subclasses provide it — confirm.
      class Item < Base
        # Data obtained by running the node parser over #item_node.
        # Computed once and reused.
        def data
          @data ||= node_parser.parse(item_node)
        end

        # Node the fields are read from; looked up once and reused.
        #
        # @return the first node matching +item_xpath+, or the whole document
        #   when +item_xpath+ is falsy
        # @raise [RuntimeError] when +item_xpath+ matches nothing
        def item_node
          @item_node ||= locate_item_node
        end

        private

        # Performs the lookup behind #item_node.
        def locate_item_node
          node = item_xpath ? nokogiri.at_xpath(item_xpath) : nokogiri
          return node if node

          raise "Item node not found (Item xpath: #{item_xpath})"
        end
      end
    end
  end
end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative 'base'
4
+
3
5
  module Aranha
4
6
  module Parsers
5
7
  module Html
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_dependency 'aranha/parsers/html/node/base'
3
+ require_relative 'base'
4
4
 
5
5
  module Aranha
6
6
  module Parsers
@@ -0,0 +1,67 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
require 'ostruct'
require 'set'
require 'yaml'
5
+
6
module Aranha
  module Spec
    # Lists pairs of source/target files in a directory.
    #
    # A fixture file is named "<basename>.source[.<ext>]" or
    # "<basename>.target[.<ext>]"; files sharing a basename form a pair.
    # Only the Ruby standard library is used.
    class SourceTargetFixtures
      class << self
        # Extracts the fixture basename from a file name.
        #
        # @param file [String] a file name or path; only its last component
        #   is inspected
        # @return [String, nil] the part before ".source"/".target", or nil
        #   when the name does not follow the fixture naming scheme
        def source_target_basename(file)
          match = /\A(.+)\.(?:source|target)(?:\..+)?\z/.match(File.basename(file))
          match && match[1]
        end
      end

      attr_reader :fixtures_directory

      # @param fixtures_directory [String] directory holding the fixtures
      def initialize(fixtures_directory)
        @fixtures_directory = fixtures_directory
      end

      # One entry per fixture basename found in the directory.
      #
      # @return [Array<OpenStruct>] objects responding to +source+ and
      #   +target+, each an absolute path or nil when that half is missing
      def source_target_files
        sources_targets_basenames.map do |basename|
          OpenStruct.new(source: source_file(basename), target: target_file(basename))
        end
      end

      # @return [Array<String>] the source path of every pair that has one
      def source_files
        source_target_files.map(&:source).compact
      end

      private

      def target_file(basename)
        fixture_file(basename, 'target')
      end

      def source_file(basename)
        fixture_file(basename, 'source')
      end

      # Finds the file for +basename+ and +suffix+ ("source" or "target").
      #
      # @return [String, nil] absolute path of the first matching entry
      def fixture_file(basename, suffix)
        prefix = "#{basename}.#{suffix}"
        # Accept "<prefix>" or "<prefix>.<ext>" only, mirroring the pattern in
        # source_target_basename; a bare prefix test would also pick up names
        # such as "<basename>.sourcemap".
        found = fixture_entries.find do |item|
          item == prefix || item.start_with?("#{prefix}.")
        end
        found && File.expand_path(found, fixtures_directory)
      end

      # @return [Set<String>] the distinct, non-blank fixture basenames
      def sources_targets_basenames
        fixture_entries.each_with_object(Set.new) do |item, basenames|
          basename = self.class.source_target_basename(item)
          next if basename.nil? || basename.strip.empty?

          basenames << basename
        end
      end

      # Directory entries without "." and "..", sorted so that results do not
      # depend on the order in which the file system lists them.
      def fixture_entries
        (Dir.entries(fixtures_directory) - %w[. ..]).sort
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'source_target_fixtures'
4
+
5
# Shared examples that run the described class over every source fixture and
# compare the result with the matching target fixture.
#
# Fixtures are looked up in "<spec file name without extension>_files", next
# to the spec file passed as +spec_file+. When the WRITE_TARGET_FIXTURES
# environment variable is set, the target files are (re)written instead of
# being compared.
RSpec.shared_examples 'source_target_fixtures' do |spec_file| # rubocop:disable Metrics/BlockLength
  let(:spec_file) { spec_file }

  it 'fixtures directory should exist' do
    expect(::File.directory?(fixtures_dir)).to be true
  end

  context 'in fixtures directory' do
    it 'should have at least one file' do
      expect(source_target_fixtures.source_target_files.count).to be > 0
    end

    if ENV['WRITE_TARGET_FIXTURES']
      it 'should write target data for all files' do
        source_target_fixtures.source_files.each do |source_path|
          results = sort_results(source_data(source_path))
          basename = ::Aranha::Spec::SourceTargetFixtures.source_target_basename(source_path)
          # "../" steps from the source file to its directory.
          target_path = File.expand_path("../#{basename}.target.yaml", source_path)
          File.write(target_path, results.to_yaml)
        end
      end
    else
      it 'should parse data for all files' do
        source_target_fixtures.source_target_files.each do |pair|
          assert_source_target_complete(pair)
          parsed = source_data(pair.source)
          expected = YAML.load_file(pair.target)
          expect(sort_results(parsed)).to eq(sort_results(expected))
        end
      end
    end
  end

  # Fixture listing for #fixtures_dir, built once per example.
  def source_target_fixtures
    @source_target_fixtures ||= ::Aranha::Spec::SourceTargetFixtures.new(fixtures_dir)
  end

  # Fails the example when either half of the pair is missing.
  def assert_source_target_complete(pair)
    expect(pair.source).to(be_truthy, "Source not found (Target: #{pair.target})")
    expect(pair.target).to(be_truthy, "Target not found (Source: #{pair.source})")
  end

  # Data produced by the described class for one source file.
  def source_data(source_file)
    described_class.new(source_file).data
  end

  # Directory holding the fixtures of the including spec.
  def fixtures_dir
    ::File.join(
      ::File.dirname(spec_file),
      "#{::File.basename(spec_file, '.*')}_files"
    )
  end

  # Returns the results unchanged; both sides of the comparison go through it.
  def sort_results(results)
    results
  end
end
@@ -0,0 +1,685 @@
1
+ application/andrew-inset
2
+ application/applixware
3
+ application/atomcat+xml
4
+ application/atomsvc+xml
5
+ application/atom+xml
6
+ application/ccxml+xml,
7
+ application/cdmi-capability
8
+ application/cdmi-container
9
+ application/cdmi-domain
10
+ application/cdmi-object
11
+ application/cdmi-queue
12
+ application/cu-seeme
13
+ application/davmount+xml
14
+ application/dssc+der
15
+ application/dssc+xml
16
+ application/ecmascript
17
+ application/emma+xml
18
+ application/epub+zip
19
+ application/exi
20
+ application/font-tdpfr
21
+ application/hyperstudio
22
+ application/ipfix
23
+ application/java-archive
24
+ application/javascript
25
+ application/java-serializd-object
26
+ application/java-vm
27
+ application/json
28
+ application/mac-binhex40
29
+ application/mac-compactpro
30
+ application/mads+xml
31
+ application/marc
32
+ application/marcxml+xml
33
+ application/mathematica
34
+ application/mathml+xml
35
+ application/mbox
36
+ application/mediaservercontrol+xml
37
+ application/metalink4+xml
38
+ application/mets+xml
39
+ application/mods+xml
40
+ application/mp21
41
+ application/mp4
42
+ application/msword
43
+ application/mxf
44
+ application/nd.uiq.theme
45
+ application/octet-stream
46
+ application/oda
47
+ application/oebps-package+xml
48
+ application/ogg
49
+ application/onenote
50
+ application/patch-ops-error+xml
51
+ application/pdf
52
+ application/pgp-encrypted
53
+ application/pgp-signature
54
+ application/pics-rules
55
+ application/pkcs10
56
+ application/pkcs7-mime
57
+ application/pkcs7-signature
58
+ application/pkcs8
59
+ application/pkix-attr-cert
60
+ application/pkix-cert
61
+ application/pkixcmp
62
+ application/pkix-crl
63
+ application/pkix-pkipath
64
+ application/pls+xml
65
+ application/postscript
66
+ application/prs.cww
67
+ application/pskc+xml
68
+ application/rdf+xml
69
+ application/reginfo+xml
70
+ application/relax-ng-compact-syntax
71
+ application/resource-lists-diff+xml
72
+ application/resource-lists+xml
73
+ application/rls-services+xml
74
+ application/rsd+xml
75
+ application/rss+xml
76
+ application/rtf
77
+ application/sbml+xml
78
+ application/scvp-cv-request
79
+ application/scvp-vp-request
80
+ application/scvp-vp-response
81
+ application/sdp
82
+ application/set-payment-initiation
83
+ application/set-reistration-initiation
84
+ application/shf+xml
85
+ application/smil+xml
86
+ application/sml+xml
87
+ application/sparql-query
88
+ application/sparql-results+xml
89
+ application/srgs
90
+ application/srgs+xml
91
+ application/sru+xml
92
+ application/svp-cv-response
93
+ application/tei+xml
94
+ application/thraud+xml
95
+ application/timestamped-data
96
+ application/vn.contact.cmsg
97
+ application/vn.crick.clicker.wordbank
98
+ application/vnd.3gp2.tcap
99
+ application/vnd.3gpp.pic-bw-large
100
+ application/vnd.3gpp.pic-bw-small
101
+ application/vnd.3gpp.pic-bw-var
102
+ application/vnd.3m.post-it-notes
103
+ application/vnd.accpac.simply.aso
104
+ application/vnd.accpac.simply.imp
105
+ application/vnd.acucobol
106
+ application/vnd.acucorp
107
+ application/vnd.adobe.air-application-installer-package+zip
108
+ application/vnd.adobe.fxp
109
+ application/vnd.adobe.xdp+xml
110
+ application/vnd.adobe.xfdf
111
+ application/vnd.ahead.space
112
+ application/vnd.airzip.filesecure.azf
113
+ application/vnd.airzip.filesecure.azs
114
+ application/vnd.amazon.ebook
115
+ application/vnd.americandynamics.acc
116
+ application/vnd.amiga.ami
117
+ application/vnd.android.package-archive
118
+ application/vnd.anser-web-certificate-issue-initiation
119
+ application/vnd.anser-web-funds-transfer-initiation
120
+ application/vnd.antix.game-component
121
+ application/vnd.apple.installe+xml
122
+ application/vnd.apple.mpegurl
123
+ application/vnd.aristanetworks.swi
124
+ application/vnd.asis.opendocument.text-template
125
+ application/vnd.audiograph
126
+ application/vnd.blueice.multipass
127
+ application/vnd.bm
128
+ application/vnd.businessobjects
129
+ application/vnd.chemdraw+xml
130
+ application/vnd.chipnuts.karaoke-mmd
131
+ application/vnd.cinderella
132
+ application/vnd.claymore
133
+ application/vnd.cloanto.rp9
134
+ application/vnd.clonk.c4group
135
+ applicationvnd.cluetrust.cartomobile-config
136
+ application/vnd.cluetrust.cartomobile-config-pkg
137
+ application/vnd.commonspace
138
+ application/vnd.cosmocaller
139
+ application/vnd.crick.clicker
140
+ application/vnd.crick.clicker.keyboard
141
+ application/vnd.crick.clicker.palette
142
+ application/vnd.crick.clicker.template
143
+ application/vnd.criticaltools.wbs+xml
144
+ application/vnd.ctc-posml
145
+ application/vnd.cups-ppd
146
+ application/vnd.curl.car
147
+ application/vnd.curl.pcurl
148
+ application/vnd.data-vision.rdz
149
+ application/vnd.dolby.mlp
150
+ application/vnd.dpgraph
151
+ application/vnd.dreamfactory
152
+ application/vnd.dvb.ait
153
+ application/vnd.dvb.service
154
+ application/vnd.dynageo
155
+ application/vnd.ecowin.chart
156
+ application/vnd.enliven
157
+ application/vn.denovo.fcselayout-link
158
+ application/vnd.epson.esf
159
+ application/vnd.epson.msf
160
+ application/vnd.epson.quickanime
161
+ application/vnd.epson.salt
162
+ application/vnd.epson.ssf
163
+ application/vnd.eszigno3+xml
164
+ application/vnd.ezpix-album
165
+ application/vnd.ezpix-package
166
+ application/vnd.fdf
167
+ application/vnd.fdsn.seed
168
+ application/vnd.flographit
169
+ application/vnd.fluxtime.clip
170
+ application/vnd.framemaker
171
+ application/vnd.frogans.fnc
172
+ application/vnd.frogans.ltf
173
+ application/vnd.fsc.weblaunch
174
+ application/vnd.fujitsu.oasys2
175
+ application/vnd.fujitsu.oasys3
176
+ application/vnd.fujitsu.oasysgp
177
+ application/vnd.fujitsu.oasysprs
178
+ application/vnd.fujitu.oasys
179
+ application/vnd.fujixerox.ddd
180
+ application/vnd.fujixerox.docuworks
181
+ application/vnd.fujixerox.docuworks.binder
182
+ application/vnd.fuzzysheet
183
+ application/vnd.genomatix.tuxedo
184
+ application/vnd.geogebra.file
185
+ application/vnd.geogebra.tool
186
+ application/vnd.geometry-explorer
187
+ application/vnd.geonext
188
+ application/vnd.geoplan
189
+ application/vnd.geospace
190
+ application/vnd.gmx
191
+ application/vnd.google-earth.kml+xml
192
+ application/vnd.google-earth.kmz
193
+ application/vnd.grafeq
194
+ application/vnd.groove-account
195
+ application/vnd.groove-help
196
+ application/vnd.groove-identity-message
197
+ application/vnd.groove-injector
198
+ application/vnd.groove-tool-message
199
+ application/vnd.groove-tool-template
200
+ application/vnd.groove-vcar
201
+ application/vnd.hal+xml
202
+ application/vnd.handheld-entertainment+xml
203
+ application/vnd.hbci
204
+ application/vnd.hhe.lesson-player
205
+ application/vnd.hp-hpgl
206
+ application/vnd.hp-hpid
207
+ application/vnd.hp-hps
208
+ application/vnd.hp-jlyt
209
+ application/vnd.hp-pcl
210
+ application/vnd.hp-pclxl
211
+ application/vnd.hydrostatix.sof-data
212
+ application/vnd.hzn-3d-crossword
213
+ application/vnd.ibm.minipay
214
+ application/vnd.ibm.modcap
215
+ application/vnd.ibm.rights-management
216
+ application/vnd.ibm.securecontainer
217
+ application/vnd.iccprofile
218
+ application/vnd.igloader
219
+ application/vnd.immervision-ivp
220
+ application/vnd.immervision-ivu
221
+ application/vnd.insors.igm
222
+ application/vnd.intercon.formnet
223
+ application/vnd.intergeo
224
+ application/vnd.intu.qbo
225
+ application/vnd.intu.qfx
226
+ application/vnd.ipunplugged.rcprofile
227
+ application/vnd.irepository.package+xml
228
+ application/vnd.isac.fcs
229
+ application/vnd.is-xpr
230
+ application/vnd.jam
231
+ application/vnd.jcp.javame.midlet-rms
232
+ application/vnd.jisp
233
+ application/vnd.joost.joda-archive
234
+ application/vnd.kahootz
235
+ application/vnd.kde.karbon
236
+ application/vnd.kde.kchart
237
+ application/vnd.kde.kformula
238
+ application/vnd.kde.kivio
239
+ application/vnd.kde.kontour
240
+ application/vnd.kde.kpresenter
241
+ application/vnd.kde.kspread
242
+ application/vnd.kde.kword
243
+ application/vnd.kenameaapp
244
+ application/vnd.kidspiration
245
+ application/vnd.kinar
246
+ application/vnd.koan
247
+ application/vnd.kodak-descriptor
248
+ application/vnd.las.las+xml
249
+ application/vnd.llamagraphics.life-balance.desktop
250
+ application/vnd.llamagraphics.life-balance.exchange+xml
251
+ application/vnd.lotus-1-2-3
252
+ application/vnd.lotus-approach
253
+ application/vnd.lotus-freelance
254
+ application/vnd.lotus-notes
255
+ application/vnd.lotus-organizer
256
+ application/vnd.lotus-screencam
257
+ application/vnd.lotus-wordro
258
+ application/vnd.macports.portpkg
259
+ application/vnd.mcd
260
+ application/vnd.medcalcdata
261
+ application/vnd.mediastation.cdkey
262
+ application/vnd.mfer
263
+ application/vnd.mfmp
264
+ application/vnd.micrografx.flo
265
+ application/vnd.micrografx.igx
266
+ application/vnd.mif
267
+ application/vnd.mobius.daf
268
+ application/vnd.mobius.dis
269
+ application/vnd.mobius.mbk
270
+ application/vnd.mobius.mqy
271
+ application/vnd.mobius.msl
272
+ application/vnd.mobius.plc
273
+ application/vnd.mobius.txf
274
+ application/vnd.mophun.application
275
+ application/vnd.mophun.certificate
276
+ application/vnd.mozilla.xul+xml
277
+ application/vnd.ms-artgalry
278
+ application/vnd.ms-ca-compressed
279
+ application/vnd.mseq
280
+ application/vnd.ms-excel
281
+ application/vnd.ms-excel.addin.macroenabled.12
282
+ application/vnd.ms-excelsheet.binary.macroenabled.12
283
+ application/vnd.ms-excel.sheet.macroenabled.12
284
+ application/vnd.ms-excel.template.macroenabled.12
285
+ application/vnd.ms-fontobject
286
+ application/vnd.ms-htmlhelp
287
+ application/vnd.msician
288
+ application/vnd.ms-ims
289
+ application/vnd.ms-lrm
290
+ application/vnd.ms-officetheme
291
+ application/vnd.ms-pki.seccat
292
+ application/vnd.ms-pki.stl
293
+ application/vnd.ms-powerpoint
294
+ application/vnd.ms-powerpoint.addin.macroenabled.12
295
+ application/vnd.ms-powerpoint.presentation.macroenabled.12
296
+ application/vnd.ms-powerpoint.slide.macroenabled.12
297
+ application/vnd.ms-powerpoint.slideshow.macroenabled.12
298
+ application/vnd.ms-powerpoint.template.macroenabled.12
299
+ application/vnd.ms-project
300
+ application/vnd.ms-word.document.macroenabled.12
301
+ application/vnd.ms-word.template.macroenabed.12
302
+ application/vnd.ms-works
303
+ application/vnd.ms-wpl
304
+ application/vnd.ms-xpsdocument
305
+ application/vnd.muvee.style
306
+ application/vnd.na
307
+ application/vnd.neurolanguage.nlu
308
+ application/vnd.noblenet-directory
309
+ application/vnd.noblenet-sealer
310
+ application/vnd.noblenet-web
311
+ application/vnd.nokia.n-gage.data
312
+ application/vnd.nokia.n-gage.symbian.install
313
+ application/vnd.nokia.radio-preset
314
+ application/vnd.nokia.radio-presets
315
+ application/vnd.novadigm.edm
316
+ application/vnd.novadigm.ext
317
+ application/vnd.novadim.edx
318
+ application/vnd.oasis.opendocumen.presentation-template
319
+ application/vnd.oasis.opendocument.char
320
+ application/vnd.oasis.opendocument.chart-template
321
+ application/vnd.oasis.opendocument.database
322
+ application/vnd.oasis.opendocument.formula
323
+ application/vnd.oasis.opendocument.formula-template
324
+ application/vnd.oasis.opendocument.graphics-template
325
+ application/vnd.oasis.opendocument.grapics
326
+ application/vnd.oasis.opendocument.image
327
+ application/vnd.oasis.opendocument.image-template
328
+ application/vnd.oasis.opendocument.presentation
329
+ application/vnd.oasis.opendocument.spreadsheet
330
+ application/vnd.oasis.opendocument.spreadsheet-template
331
+ application/vnd.oasis.opendocument.text
332
+ application/vnd.oasis.opendocument.text-master
333
+ application/vnd.oasis.opendocument.text-web
334
+ application/vnd.olpc-sugar
335
+ application/vnd.oma.dd2+xml
336
+ application/vnd.openofficeorg.extension
337
+ application/vnd.openxmformats-officedocument.wordprocessingml.document
338
+ application/vnd.openxmlformats-officedocument.presentationml.presentation
339
+ application/vnd.openxmlformats-officedocument.presentationml.slide
340
+ application/vnd.openxmlformats-officedocument.presentationml.slideshw
341
+ application/vnd.openxmlformats-officedocument.presentationml.template
342
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
343
+ application/vnd.openxmlformats-officedocument.spreadsheetml.template
344
+ application/vnd.openxmlformats-officedocument.wordprocessingml.template
345
+ application/vnd.osgeo.mapguide.package
346
+ application/vnd.osgi.dp
347
+ application/vnd.palm
348
+ application/vnd.pawaafile
349
+ application/vnd.pg.format
350
+ application/vnd.pg.osasli
351
+ application/vnd.picsel
352
+ application/vnd.pmi.widget
353
+ application/vnd.pocketlearn
354
+ application/vnd.powerbuilder6
355
+ application/vnd.previewsystems.box
356
+ application/vnd.proteus.magazine
357
+ application/vnd.publishare-delta-tree
358
+ application/vnd.pvi.ptid1
359
+ application/vnd.quark.quarkxpress
360
+ application/vnd.realvnc.bed
361
+ application/vnd.recordare.musicxml
362
+ application/vnd.recordare.musicxml+xml
363
+ application/vnd.rig.cryptonote
364
+ application/vnd.rim.cod
365
+ application/vnd.rn-realmedia
366
+ application/vnd.route66.link66+xml
367
+ application/vnd.sailingtracker.track
368
+ application/vnd.seemail
369
+ application/vnd.sema
370
+ application/vnd.semd
371
+ application/vnd.semf
372
+ application/vnd.shana.informed.formdata
373
+ application/vnd.shana.informed.formtemplate
374
+ application/vnd.shana.informed.interchange
375
+ application/vnd.shana.informed.package
376
+ application/vnd.simtech-mindmapper
377
+ application/vnd.smaf
378
+ application/vnd.smart.teacher
379
+ application/vnd.solent.sdkm+xml
380
+ application/vnd.spotfire.dxp
381
+ application/vnd.spotfire.sfs
382
+ application/vnd.stardivision.calc
383
+ application/vnd.stardivision.draw
384
+ application/vnd.stardivision.impress
385
+ application/vnd.stardivision.math
386
+ application/vnd.stardivision.writer
387
+ application/vnd.stepmania.stepchart
388
+ application/vnd.sun.xl.impress.template
389
+ application/vnd.sun.xml.calc
390
+ application/vnd.sun.xml.calc.template
391
+ application/vnd.sun.xml.draw
392
+ application/vnd.sun.xml.draw.template
393
+ application/vnd.sun.xml.impress
394
+ application/vnd.sun.xml.math
395
+ application/vnd.sun.xml.writer
396
+ application/vnd.sun.xml.writer.global
397
+ application/vnd.sun.xml.writer.template
398
+ application/vnd.sus-calendar
399
+ application/vnd.svd
400
+ application/vnd.symbian.install
401
+ application/vnd.syncml.dm+wbxml
402
+ application/vnd.syncml.dm+xml
403
+ application/vnd.syncml+xml
404
+ application/vnd.tardivision.writer-global
405
+ application/vnd.tmobile-ivetv
406
+ application/vnd.to.intent-module-archive
407
+ applicationvnd.trid.tpt
408
+ application/vnd.triscape.mxs
409
+ application/vnd.trueapp
410
+ application/vnd.ufdl
411
+ application/vnd.ul
412
+ application/vnd.umajin
413
+ application/vnd.unity
414
+ application/vnd.uoml+xml
415
+ application/vnd.vcx
416
+ application/vnd.visionary
417
+ application/vnd.vsf
418
+ application/vnd.wap.wbxml
419
+ application/vnd.wap.wmlc
420
+ application/vnd.wap.wmlscriptc
421
+ application/vnd.webturb
422
+ application/vnd.wolfrm.player
423
+ application/vnd.wordperfect
424
+ application/vnd.wqd
425
+ application/vnd.wt.stf
426
+ application/vnd.xara
427
+ application/vnd.xfdl
428
+ application/vnd.yamaha.h-script
429
+ application/vnd.yamaha.hv-dic
430
+ application/vnd.yamaha.hv-voice
431
+ application/vnd.yamaha.openscoreformat
432
+ application/vnd.yamaha.openscoreformat.osfpvg+xml
433
+ application/vnd.yamaha.smaf-phrase
434
+ application/vnd.yellowriver-custom-menu
435
+ application/vnd.ymaha.smaf-audio
436
+ application/vnd.zzazz.deck+xml
437
+ application/vn.visio
438
+ application/voicexml+xml
439
+ application/widget
440
+ application/winhlp
441
+ application/wsdl+xml
442
+ application/wspolicy+xml
443
+ application/x-7z-compressed
444
+ application/x-abiword
445
+ application/x-ace-compressed
446
+ application/x-athorware-map
447
+ application/x-authorware-bin
448
+ application/x-authorware-seg
449
+ application/x-bcpio
450
+ application/x-bittorrent
451
+ application/x-bzip
452
+ application/x-bzip2
453
+ application/xcap-diff+xml
454
+ application/x-cdlink
455
+ application/x-chat
456
+ application/x-chess-pgn
457
+ application/x-cpio
458
+ application/x-csh
459
+ application/x-debian-package
460
+ application/x-director
461
+ application/x-doom
462
+ application/x-dtbncx+xml
463
+ application/x-dtbook+xml
464
+ application/x-dtbresource+xml
465
+ application/x-dvi
466
+ application/xenc+xml
467
+ application/x-font-bdf
468
+ application/x-font-ghostscript
469
+ application/x-font-linux-psf
470
+ application/x-font-otf
471
+ application/x-font-pcf
472
+ application/x-font-snf
473
+ application/x-font-ttf
474
+ application/xfont-type1
475
+ application/x-font-woff
476
+ application/x-futuresplash
477
+ application/x-gnumeric
478
+ application/x-gtar
479
+ application/x-hdf
480
+ application/xhtml+xml
481
+ application/x-java-jnlp-file
482
+ application/x-latex
483
+ application/xml
484
+ application/xml-dtd
485
+ application/x-mobipocket-ebook
486
+ application/x-msaccess
487
+ application/x-ms-application
488
+ application/x-msbinder
489
+ application/x-mscardfile
490
+ application/x-msclip
491
+ application/x-msdownload
492
+ application/x-msmediaview
493
+ application/x-msmetafile
494
+ application/x-msmoney
495
+ application/x-mspublisher
496
+ application/x-msschedule
497
+ application/x-msterminal
498
+ application/x-ms-wmd
499
+ application/x-ms-wmz
500
+ application/x-mswrite
501
+ application/x-ms-xbap
502
+ application/x-netcdf
503
+ application/xop+xml
504
+ application/x-pkcs12
505
+ application/x-pkcs7-certificates
506
+ application/x-pkcs7-certreqresp
507
+ application/x-rar-compressed
508
+ application/x-sh
509
+ application/x-shar
510
+ application/x-shockwave-flash
511
+ application/x-silverlight-app
512
+ application/xslt+xml
513
+ application/xspf+xml
514
+ application/x-stuffit
515
+ application/x-stuffitx
516
+ application/x-sv4cpio
517
+ application/x-sv4crc
518
+ application/x-tar
519
+ application/x-tcl
520
+ application/x-tex
521
+ application/x-texinfo
522
+ application/x-tex-tfm
523
+ application/x-ustar
524
+ application/xv+xml
525
+ application/x-wais-source
526
+ application/x-x509-ca-cert
527
+ application/x-xfig
528
+ application/x-xpinstall
529
+ application/yang
530
+ application/yin+xml
531
+ application/zip
532
+ audio/adpcm
533
+ audio/basic
534
+ audio/midi
535
+ audio/mp4
536
+ audio/mpeg
537
+ audio/ogg
538
+ audio/vnd.dece.audio
539
+ audio/vnd.digital-winds
540
+ audio/vnd.dra
541
+ audio/vnd.dts
542
+ audio/vnd.dts.hd
543
+ audio/vnd.lucent.voice
544
+ audio/vnd.ms-playready.media.pya
545
+ audio/vnd.nuera.ecelp4800
546
+ audio/vnd.nuera.ecelp7470
547
+ audio/vnd.nuera.ecelp9600
548
+ audio/vnd.rip
549
+ audio/webm
550
+ audio/x-aac
551
+ audio/x-aiff
552
+ audio/x-mpegurl
553
+ audio/x-ms-wax
554
+ audio/x-ms-wma
555
+ audio/x-pn-realaudio
556
+ audio/x-pn-realaudio-plugin
557
+ audio/x-wav
558
+ chemical/x-cdx
559
+ chemical/x-cif
560
+ chemical/x-cmdf
561
+ chemical/x-cml
562
+ chemical/x-csml
563
+ chemical/x-xyz
564
+ image/bmp
565
+ image/cgm
566
+ image/g3fax
567
+ image/gif
568
+ image/ief
569
+ image/jpeg
570
+ image/ktx
571
+ image/png
572
+ image/-portable-bitmap
573
+ image/prs.btif
574
+ image/svg+xml
575
+ image/tiff
576
+ image/vnd.adobe.photoshop
577
+ image/vnd.dece.graphic
578
+ image/vnd.djvu
579
+ image/vnd.dvb.subtitle
580
+ image/vnd.dxf
581
+ image/vnd.fastbidsheet
582
+ image/vnd.fpx
583
+ image/vnd.fst
584
+ image/vnd.fujixerox.edmics-mmr
585
+ image/vnd.fujixerox.edmics-rlc
586
+ image/vnd.ms-modi
587
+ image/vnd.net-fpx
588
+ image/vnd.wap.wbmp
589
+ image/vnd.xiff
590
+ image/webp
591
+ image/x-cmu-raster
592
+ image/x-cmx
593
+ image/x-freehand
594
+ image/x-icon
595
+ image/x-pcx
596
+ image/x-pict
597
+ image/x-portable-anymap
598
+ image/x-portable-graymap
599
+ image/x-portable-pixmap
600
+ image/x-rgb
601
+ image/x-xbitmap
602
+ image/x-xpixmap
603
+ image/x-xwindowump
604
+ imag/vnd.dwg
605
+ message/rfc82
606
+ mode/iges
607
+ model/mesh
608
+ model/vnd.collada+xml
609
+ model/vnd.dwf
610
+ model/vnd.gdl
611
+ model/vnd.mts
612
+ model/vnd.vtu
613
+ model/vn.gtw
614
+ model/vrml
615
+ text/calendar
616
+ text/css
617
+ text/csv
618
+ text/html
619
+ text/n3
620
+ text/plain
621
+ text/plain-bas
622
+ text/prs.lines.tag
623
+ text/richtex
624
+ text/sgml
625
+ text/tab-separated-values
626
+ text/troff
627
+ text/turtle
628
+ text/uri-list
629
+ text/vnd.crl.scurl
630
+ text/vnd.curl
631
+ text/vnd.curl.dcurl
632
+ text/vnd.curl.mcurl
633
+ text/vnd.fly
634
+ text/vnd.fmi.flexstor
635
+ text/vnd.graphviz
636
+ text/vnd.in3d.3dml
637
+ text/vnd.in3d.spot
638
+ text/vnd.sun.j2me.app-descriptor
639
+ text/vnd.wap.wml
640
+ text/vnd.wap.wmlscript
641
+ text/x-asm
642
+ text/x-c
643
+ text/x-fortran
644
+ text/x-java-source,java
645
+ text/x-pascal
646
+ text/x-setext
647
+ text/x-uuencode
648
+ text/x-vcalendar
649
+ text/x-vcard
650
+ text/yaml
651
+ video/3gpp
652
+ video/3gpp2
653
+ video/h261
654
+ video/h263
655
+ video/h264
656
+ video/jpeg
657
+ video/jpm
658
+ video/mj2
659
+ video/mp4
660
+ video/mpeg
661
+ video/ogg
662
+ video/quicktime
663
+ video/vnd.dece.mobile
664
+ video/vnd.dece.pd
665
+ video/vnd.dece.sd
666
+ video/vnd.dece.video
667
+ video/vnd.dec.hd
668
+ video/vnd.fvt
669
+ video/vnd.mpegurl
670
+ video/vnd.ms-playready.media.pyv
671
+ video/vnd.uvvu.mp4
672
+ video/vnd.vivo
673
+ video/webm
674
+ video/x-f4v
675
+ video/x-fli
676
+ video/x-flv
677
+ video/x-m4v
678
+ video/x-ms-asf
679
+ video/x-msvieo
680
+ video/x-ms-wm
681
+ video/x-ms-wmv
682
+ video/x-ms-wmx
683
+ video/x-ms-wvx
684
+ video/x-sgi-movie
685
+ x-conference/x-cooltalk
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'selenium-webdriver'
4
+
5
module Aranha
  module Selenium
    # Creates Selenium WebDriver instances (Firefox) configured to save
    # downloads into a given directory.
    class DriverFactory
      class << self
        DEFAULT_DOWNLOAD_DIR = '/tmp/aranha_download_dir'

        # Builds a Firefox driver.
        #
        # @param options [Hash] accepts +:download_dir+ (string or symbol
        #   key); DEFAULT_DOWNLOAD_DIR is used when it is missing
        # @return the driver returned by ::Selenium::WebDriver.for
        def create_driver(options = {})
          settings = options.with_indifferent_access
          settings[:download_dir] ||= DEFAULT_DOWNLOAD_DIR
          create_firefox_driver(settings)
        end

        private

        # Starts Firefox with the profile built from +options+.
        def create_firefox_driver(options)
          firefox_options = ::Selenium::WebDriver::Firefox::Options.new(
            profile: create_firefox_profile(options)
          )
          ::Selenium::WebDriver.for(:firefox, options: firefox_options)
        end

        # Builds the Firefox profile carrying the download preferences.
        # NOTE(review): presumably folderList = 2 makes Firefox use
        # "browser.download.dir" — confirm against Firefox documentation.
        def create_firefox_profile(options)
          ::Selenium::WebDriver::Firefox::Profile.new.tap do |profile|
            {
              'browser.download.dir' => options[:download_dir],
              'browser.download.folderList' => 2,
              'browser.helperApps.neverAsk.saveToDisk' => auto_download_mime_types.join(';'),
              'pdfjs.disabled' => true
            }.each { |preference, value| profile[preference] = value }
          end
        end

        # MIME types read from the "auto_download_mime_types" file located
        # beside this source file, one per line, stripped of whitespace.
        def auto_download_mime_types
          list_path = ::File.join(__dir__, 'auto_download_mime_types')
          ::File.read(list_path).each_line.map(&:strip)
        end
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aranha/selenium/driver_factory'
4
+
5
+ module Aranha
6
+ module Selenium
7
+ class Session < ::SimpleDelegator
8
+ attr_reader :downloads, :wait
9
+
10
+ def initialize
11
+ @downloads = Downloads.new
12
+ @wait = ::Selenium::WebDriver::Wait.new(timeout: 15)
13
+ super(::Aranha::Selenium::DriverFactory.create_driver download_dir: @downloads.dir)
14
+ end
15
+
16
+ def find_or_not_element(find_element_args)
17
+ r = find_elements(find_element_args)
18
+ r.any? ? r.first : nil
19
+ end
20
+
21
+ def wait_for_click(find_element_args)
22
+ wait.until do
23
+ element = find_element(find_element_args)
24
+ element ? element_click(element) : nil
25
+ end
26
+ end
27
+
28
+ def wait_for_element(find_element_args)
29
+ wait.until { find_element(find_element_args) }
30
+ end
31
+
32
+ def wait_for_download
33
+ initial_downloads = downloads.current
34
+ yield
35
+ new_downloads = []
36
+ wait.until do
37
+ new_downloads = downloads.current - initial_downloads
38
+ new_downloads.any?
39
+ end
40
+ new_downloads.first
41
+ end
42
+
43
+ def current_source
44
+ element = find_element(xpath: '/html[1]')
45
+ raise 'Root element not found' unless element
46
+
47
+ s = element.attribute('innerHTML')
48
+ "<html>\n#{s}\n</html>\n"
49
+ end
50
+
51
+ class Downloads
52
+ attr_reader :dir
53
+
54
+ def initialize
55
+ @dir = ::Dir.mktmpdir
56
+ end
57
+
58
+ def current
59
+ Dir.glob("#{dir}/**/*")
60
+ end
61
+ end
62
+
63
+ private
64
+
65
+ def element_click(element)
66
+ element.click
67
+ element
68
+ rescue ::Selenium::WebDriver::Error::ElementClickInterceptedError,
69
+ ::Selenium::WebDriver::Error::ElementNotInteractableError
70
+ nil
71
+ end
72
+ end
73
+ end
74
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Aranha
4
- VERSION = '0.5.0'
4
+ VERSION = '0.6.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aranha
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eduardo H. Bogoni
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-06-11 00:00:00.000000000 Z
11
+ date: 2019-07-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: active_scaffold
@@ -66,6 +66,26 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 4.2.10
69
+ - !ruby/object:Gem::Dependency
70
+ name: selenium-webdriver
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.142'
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ version: 3.142.3
79
+ type: :runtime
80
+ prerelease: false
81
+ version_requirements: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - "~>"
84
+ - !ruby/object:Gem::Version
85
+ version: '3.142'
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: 3.142.3
69
89
  - !ruby/object:Gem::Dependency
70
90
  name: sqlite3
71
91
  requirement: !ruby/object:Gem::Requirement
@@ -110,11 +130,17 @@ files:
110
130
  - lib/aranha/fixtures/download.rb
111
131
  - lib/aranha/parsers/base.rb
112
132
  - lib/aranha/parsers/html/base.rb
133
+ - lib/aranha/parsers/html/item.rb
113
134
  - lib/aranha/parsers/html/item_list.rb
114
135
  - lib/aranha/parsers/html/node/base.rb
115
136
  - lib/aranha/parsers/html/node/default.rb
116
137
  - lib/aranha/parsers/invalid_state_exception.rb
138
+ - lib/aranha/parsers/spec/source_target_fixtures.rb
139
+ - lib/aranha/parsers/spec/source_target_fixtures_example.rb
117
140
  - lib/aranha/processor.rb
141
+ - lib/aranha/selenium/auto_download_mime_types
142
+ - lib/aranha/selenium/driver_factory.rb
143
+ - lib/aranha/selenium/session.rb
118
144
  - lib/aranha/version.rb
119
145
  - lib/tasks/aranha_tasks.rake
120
146
  - test/aranha_test.rb
@@ -181,42 +207,42 @@ signing_key:
181
207
  specification_version: 4
182
208
  summary: Rails utilities for web crawling.
183
209
  test_files:
184
- - test/aranha_test.rb
210
+ - test/dummy/Rakefile
211
+ - test/dummy/README.rdoc
212
+ - test/dummy/config.ru
213
+ - test/dummy/config/boot.rb
214
+ - test/dummy/config/database.yml
215
+ - test/dummy/config/secrets.yml
216
+ - test/dummy/config/locales/en.yml
217
+ - test/dummy/config/application.rb
218
+ - test/dummy/config/environments/development.rb
219
+ - test/dummy/config/environments/test.rb
220
+ - test/dummy/config/environments/production.rb
221
+ - test/dummy/config/environment.rb
185
222
  - test/dummy/config/routes.rb
186
223
  - test/dummy/config/initializers/assets.rb
187
224
  - test/dummy/config/initializers/cookies_serializer.rb
188
- - test/dummy/config/initializers/to_time_preserves_timezone.rb
225
+ - test/dummy/config/initializers/inflections.rb
189
226
  - test/dummy/config/initializers/session_store.rb
190
- - test/dummy/config/initializers/backtrace_silencers.rb
191
227
  - test/dummy/config/initializers/wrap_parameters.rb
228
+ - test/dummy/config/initializers/to_time_preserves_timezone.rb
192
229
  - test/dummy/config/initializers/filter_parameter_logging.rb
193
- - test/dummy/config/initializers/inflections.rb
230
+ - test/dummy/config/initializers/backtrace_silencers.rb
194
231
  - test/dummy/config/initializers/mime_types.rb
195
- - test/dummy/config/database.yml
196
- - test/dummy/config/secrets.yml
197
- - test/dummy/config/locales/en.yml
198
- - test/dummy/config/environment.rb
199
- - test/dummy/config/boot.rb
200
- - test/dummy/config/application.rb
201
- - test/dummy/config/environments/production.rb
202
- - test/dummy/config/environments/test.rb
203
- - test/dummy/config/environments/development.rb
204
- - test/dummy/Rakefile
205
- - test/dummy/public/favicon.ico
206
- - test/dummy/public/404.html
207
- - test/dummy/public/500.html
208
- - test/dummy/public/422.html
209
- - test/dummy/config.ru
210
- - test/dummy/app/assets/stylesheets/application.css
211
- - test/dummy/app/assets/javascripts/application.js
212
- - test/dummy/app/helpers/application_helper.rb
232
+ - test/dummy/db/schema.rb
213
233
  - test/dummy/app/views/layouts/application.html.erb
214
234
  - test/dummy/app/controllers/application_controller.rb
215
- - test/dummy/bin/setup
235
+ - test/dummy/app/helpers/application_helper.rb
236
+ - test/dummy/app/assets/stylesheets/application.css
237
+ - test/dummy/app/assets/javascripts/application.js
238
+ - test/dummy/public/422.html
239
+ - test/dummy/public/404.html
240
+ - test/dummy/public/favicon.ico
241
+ - test/dummy/public/500.html
216
242
  - test/dummy/bin/bundle
243
+ - test/dummy/bin/setup
217
244
  - test/dummy/bin/rails
218
245
  - test/dummy/bin/rake
219
- - test/dummy/db/schema.rb
220
- - test/dummy/README.rdoc
221
- - test/integration/navigation_test.rb
246
+ - test/aranha_test.rb
222
247
  - test/test_helper.rb
248
+ - test/integration/navigation_test.rb