crabfarm 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/crabfarm/adapters/parser/nokogiri.rb +12 -4
- data/lib/crabfarm/adapters/parser/pdf_reader.rb +10 -2
- data/lib/crabfarm/base_parser.rb +29 -9
- data/lib/crabfarm/cli.rb +1 -1
- data/lib/crabfarm/context_factory.rb +2 -1
- data/lib/crabfarm/global_state.rb +4 -6
- data/lib/crabfarm/modes/recorder.rb +8 -2
- data/lib/crabfarm/rspec.rb +16 -7
- data/lib/crabfarm/templates/parser.rb.erb +3 -1
- data/lib/crabfarm/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fff9d93bc19a725f5fd02a5460ceafe44c3c7694
|
4
|
+
data.tar.gz: 46a109065b2018ed7a1a1bac296d0c53fba4e6d5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d756345e28af5f03a2a5bb9404035c6262ca0e192606939e8d8daa5968fca2a3c6c9faab52d3a81c4bf4bcf54579a5d7d031af12c86016b9ec3461ef17edd6f6
|
7
|
+
data.tar.gz: 79f8db6736962460721927082c49d9060aebcdfdc8b8773101427a113ee1b0e08c8065cbbd7715afc72d9e70af02adf10f6bd436e3af526ef85b3655c1a84ca3
|
data/lib/crabfarm/adapters/parser/nokogiri.rb
CHANGED
@@ -2,11 +2,19 @@ require 'nokogiri'
|
|
2
2
|
|
3
3
|
module Crabfarm
|
4
4
|
class NokogiriAdapter
|
5
|
-
def self.
|
6
|
-
|
7
|
-
|
5
|
+
def self.format
|
6
|
+
'html'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.parse(_raw)
|
10
|
+
Nokogiri::HTML _raw
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.preprocess_parsing_target(_target)
|
14
|
+
if _target.respond_to? :to_html
|
15
|
+
_target.to_html
|
8
16
|
else
|
9
|
-
|
17
|
+
_target
|
10
18
|
end
|
11
19
|
end
|
12
20
|
end
|
data/lib/crabfarm/adapters/parser/pdf_reader.rb
CHANGED
@@ -2,8 +2,16 @@ require "pdf-reader"
|
|
2
2
|
|
3
3
|
module Crabfarm
|
4
4
|
class PdfReaderAdapter
|
5
|
-
def self.
|
6
|
-
|
5
|
+
def self.format
|
6
|
+
'pdf'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.parse(_raw)
|
10
|
+
PDF::Reader.new StringIO.new _raw
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.preprocess_parsing_target(_target)
|
14
|
+
_target
|
7
15
|
end
|
8
16
|
end
|
9
17
|
end
|
data/lib/crabfarm/base_parser.rb
CHANGED
@@ -6,13 +6,26 @@ module Crabfarm
|
|
6
6
|
|
7
7
|
attr_reader :params, :document
|
8
8
|
|
9
|
-
def self.
|
10
|
-
@
|
9
|
+
def self.parser_engine(_engine=nil)
|
10
|
+
@engine_name = _engine
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.engine
|
14
|
+
@engine ||= Strategies.load(:parser_engine, @engine_name || Crabfarm.config.parser_engine)
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.snapshot_path(_name=nil)
|
18
|
+
_name = self.to_s.underscore if _name.nil?
|
19
|
+
File.join(GlobalState.snapshots_path, _name + '.' + engine.format)
|
20
|
+
end
|
21
|
+
|
22
|
+
def engine
|
23
|
+
self.class.engine
|
11
24
|
end
|
12
25
|
|
13
26
|
def initialize(_target, _params)
|
14
|
-
|
15
|
-
@document =
|
27
|
+
@parsed_data = engine.preprocess_parsing_target _target
|
28
|
+
@document = engine.parse @parsed_data
|
16
29
|
@params = _params
|
17
30
|
|
18
31
|
super @document
|
@@ -22,6 +35,18 @@ module Crabfarm
|
|
22
35
|
raise NotImplementedError.new
|
23
36
|
end
|
24
37
|
|
38
|
+
def take_snapshot(_name=nil)
|
39
|
+
file_path = self.class.snapshot_path _name
|
40
|
+
|
41
|
+
raise ArgumentError.new "Snapshot already exists '#{file_path}', make sure to implement the #{self.class.to_s} parse method." if File.exist? file_path
|
42
|
+
|
43
|
+
dir_path = file_path.split(File::SEPARATOR)[0...-1]
|
44
|
+
FileUtils.mkpath dir_path.join(File::SEPARATOR) if dir_path.length > 0
|
45
|
+
|
46
|
+
File.write file_path, @parsed_data
|
47
|
+
nil
|
48
|
+
end
|
49
|
+
|
25
50
|
def __getobj__
|
26
51
|
@document
|
27
52
|
end
|
@@ -30,10 +55,5 @@ module Crabfarm
|
|
30
55
|
@document = obj
|
31
56
|
end
|
32
57
|
|
33
|
-
private
|
34
|
-
|
35
|
-
def class_engine
|
36
|
-
self.class.instance_variable_get :@engine
|
37
|
-
end
|
38
58
|
end
|
39
59
|
end
|
data/lib/crabfarm/cli.rb
CHANGED
@@ -115,7 +115,7 @@ module Crabfarm
|
|
115
115
|
next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
|
116
116
|
|
117
117
|
require "crabfarm/modes/recorder"
|
118
|
-
Crabfarm::Modes::Recorder.start
|
118
|
+
Crabfarm::Modes::Recorder.start args[0], options[:playback]
|
119
119
|
end
|
120
120
|
end
|
121
121
|
|
data/lib/crabfarm/context_factory.rb
CHANGED
@@ -21,7 +21,8 @@ module Crabfarm
|
|
21
21
|
|
22
22
|
def load_crabtrap_context(_memento)
|
23
23
|
require 'crabfarm/crabtrap_context'
|
24
|
-
|
24
|
+
require 'crabfarm/modes/recorder'
|
25
|
+
m_path = Modes::Recorder.memento_path _memento
|
25
26
|
raise ResourceNotFoundError.new "Could not find memento '#{_name}'" unless File.exists? m_path
|
26
27
|
Crabfarm::CrabtrapContext.new :replay, m_path
|
27
28
|
end
|
data/lib/crabfarm/global_state.rb
CHANGED
@@ -9,14 +9,12 @@ module Crabfarm
|
|
9
9
|
CF_PATH
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
13
|
-
|
14
|
-
File.join(app_path, 'spec/mementos', _name + '.json.gz')
|
12
|
+
def mementos_path
|
13
|
+
File.join(app_path, 'spec/mementos')
|
15
14
|
end
|
16
15
|
|
17
|
-
def
|
18
|
-
|
19
|
-
File.join(app_path, 'spec/snapshots', _file_name)
|
16
|
+
def snapshots_path
|
17
|
+
File.join app_path, 'spec/snapshots'
|
20
18
|
end
|
21
19
|
|
22
20
|
extend self
|
data/lib/crabfarm/modes/recorder.rb
CHANGED
@@ -6,14 +6,20 @@ module Crabfarm
|
|
6
6
|
module Modes
|
7
7
|
module Recorder
|
8
8
|
|
9
|
+
def self.memento_path(_name)
|
10
|
+
File.join(GlobalState.mementos_path, _name + '.json.gz')
|
11
|
+
end
|
12
|
+
|
9
13
|
def self.start(_target, _replay=false)
|
10
14
|
return puts "Must provide a recording target" unless _target.is_a? String
|
11
|
-
|
15
|
+
|
16
|
+
target_path = memento_path _target
|
17
|
+
return puts "Memento file does not exist: #{target_path}" if _replay and not File.exist? target_path
|
12
18
|
|
13
19
|
crabtrap_config = Crabfarm.config.crabtrap_config
|
14
20
|
crabtrap_config[:mode] = _replay ? :replay : :capture
|
15
21
|
crabtrap_config[:port] = Utils::PortDiscovery.find_available_port
|
16
|
-
crabtrap_config[:bucket_path] =
|
22
|
+
crabtrap_config[:bucket_path] = target_path
|
17
23
|
|
18
24
|
crabtrap = CrabtrapRunner.new crabtrap_config
|
19
25
|
crabtrap.start
|
data/lib/crabfarm/rspec.rb
CHANGED
@@ -3,12 +3,21 @@ module Crabfarm
|
|
3
3
|
|
4
4
|
class Error < Crabfarm::Error; end
|
5
5
|
|
6
|
-
def parse(_snapshot, _options={})
|
7
|
-
|
6
|
+
def parse(_snapshot=nil, _options={})
|
7
|
+
|
8
|
+
raise Error.new "Crawl is only available in parser specs" unless described_class < Crabfarm::BaseParser
|
9
|
+
|
10
|
+
if _snapshot.is_a? Hash
|
11
|
+
raise ArgumentException.new 'Invalid arguments' unless _options.nil?
|
12
|
+
_options = _snapshot
|
13
|
+
_snapshot = nil
|
14
|
+
end
|
15
|
+
|
16
|
+
snapshot_path = described_class.snapshot_path _snapshot
|
8
17
|
raise Error.new "Snapshot does not exist #{_snapshot}" unless File.exist? snapshot_path
|
9
18
|
|
10
|
-
|
11
|
-
parser = described_class.new
|
19
|
+
data = File.read snapshot_path
|
20
|
+
parser = described_class.new data, _options
|
12
21
|
parser.parse
|
13
22
|
parser
|
14
23
|
end
|
@@ -39,7 +48,7 @@ module Crabfarm
|
|
39
48
|
end
|
40
49
|
|
41
50
|
def parser
|
42
|
-
@parser
|
51
|
+
@parser ||= parse
|
43
52
|
end
|
44
53
|
|
45
54
|
def driver(_session_id=nil)
|
@@ -54,8 +63,8 @@ RSpec.configure do |config|
|
|
54
63
|
|
55
64
|
config.around(:example) do |example|
|
56
65
|
if described_class < Crabfarm::BaseParser
|
57
|
-
if example.metadata[:parsing]
|
58
|
-
@parser = parse example.metadata[:parsing], example.metadata[:
|
66
|
+
if example.metadata[:parsing] || example[:parsing_with_params]
|
67
|
+
@parser = parse example.metadata[:parsing], example.metadata[:parsing_with_params] || {}
|
59
68
|
end
|
60
69
|
example.run
|
61
70
|
elsif described_class < Crabfarm::BaseState
|
data/lib/crabfarm/templates/parser.rb.erb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
class <%= parser_class %> < Crabfarm::BaseParser
|
2
2
|
|
3
3
|
def parse
|
4
|
-
#
|
4
|
+
# You can replace the following line after running the owner state specs once.
|
5
|
+
# Take a look at the 'Testing' section of the README.md for more information!
|
6
|
+
take_snapshot
|
5
7
|
end
|
6
8
|
|
7
9
|
end
|
data/lib/crabfarm/version.rb
CHANGED