sinew 1.0.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ require 'csv'
2
+ require 'stringex'
3
+
4
+ #
5
+ # CSV output.
6
+ #
7
+
8
module Sinew
  #
  # CSV output. Keeps a copy of every emitted row in memory (used by #report)
  # and streams a normalized version of each row to a CSV file named after
  # the recipe.
  #
  class Output
    attr_reader :sinew, :columns, :rows, :csv

    # sinew - the owning object. This class calls #options, #quiet?, #banner
    #         and #vputs on it.
    def initialize(sinew)
      @sinew = sinew
      @rows = []
    end

    # Path of the CSV file, derived from sinew.options[:recipe]: the recipe's
    # extension is swapped for ".csv", or ".csv" is appended when the recipe
    # has no extension. The result is memoized.
    def filename
      @filename ||= begin
        recipe = sinew.options[:recipe]
        ext = File.extname(recipe)
        if ext.empty?
          "#{recipe}.csv"
        else
          # Swap only the LAST occurrence of the extension. A plain
          # gsub(ext, '.csv') also rewrites earlier occurrences, e.g.
          # "a.sinew.d/b.sinew" would become "a.csv.d/b.csv".
          stem, _ext, rest = recipe.rpartition(ext)
          "#{stem}.csv#{rest}"
        end
      end
    end

    # Open the CSV file and write the header row.
    #
    # columns - column names; nested arrays are flattened.
    #
    # Returns the CSV object (the result of appending the header row).
    def header(columns)
      sinew.banner("Writing to #{filename}...") unless sinew.quiet?

      columns = columns.flatten
      @columns = columns

      # open csv, write header row
      @csv = CSV.open(filename, 'wb')
      csv << columns
    end

    # Record one row and append it to the CSV file.
    #
    # row - Hash keyed by column. If no header has been written yet, the keys
    #       of this first row become the columns.
    def emit(row)
      # implicit header if necessary
      header(row.keys) unless csv

      # keep an untouched copy for #count / #report
      rows << row.dup

      # map columns to row, and normalize along the way. Only non-blank
      # values are included in the verbose printout.
      # NOTE(review): #present? and #ai are not defined in this file -
      # presumably core extensions / awesome_print; confirm.
      printable = {}
      values = columns.map do |column|
        value = normalize(row[column])
        printable[column] = value if value.present?
        value
      end

      # print
      sinew.vputs printable.ai

      csv << values
      # flush after every row so the file on disk stays current
      csv.flush
    end

    # Number of rows emitted so far.
    def count
      rows.length
    end

    # Print a per-column fill-rate table to $stderr: for each column, how many
    # rows had a non-blank value, out of how many rows, and the percentage.
    # Does nothing when no rows were emitted.
    def report
      return if count == 0

      sinew.banner("Got #{count} rows.")

      # calculate counts
      counts = Hash.new(0)
      rows.each do |row|
        row.each_pair { |key, value| counts[key] += 1 if value.present? }
      end
      # sort by counts (most populated first, then by column)
      cols = columns.sort_by { |col| [ -counts[col], col ] }

      # report
      len = cols.map { |col| col.to_s.length }.max
      fmt = " %-#{len + 1}s %7d / %-7d %6.1f%%\n"
      cols.each do |col|
        $stderr.printf(fmt, col, counts[col], count, counts[col] * 100.0 / count)
      end
    end

    protected

    # Turn an arbitrary cell value into a clean single-line ASCII string.
    #
    # s - a Nokogiri element/node set (its inner_html is used), an Array
    #     (elements joined with '|'), or anything else (to_s).
    def normalize(s)
      # noko/array/misc => string
      s = case s
      when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
        s.inner_html
      when Array
        s.map(&:to_s).join('|')
      else
        s.to_s
      end

      #
      # Below uses stringex
      #
      # github.com/rsl/stringex/blob/master/lib/stringex/string_extensions.rb
      # github.com/rsl/stringex/blob/master/lib/stringex/localization/conversion_expressions.rb
      #

      # <a>b</a> => b
      s = s.strip_html_tags

      # Converts MS Word 'smart punctuation' to ASCII
      s = s.convert_smart_punctuation

      # "&aacute;".convert_accented_html_entities # => "a"
      s = s.convert_accented_html_entities

      # &amp, &frac, etc.
      s = s.convert_miscellaneous_html_entities

      # convert unicode => regular characters
      s = s.to_ascii

      # squish
      s.squish
    end
  end
end
@@ -0,0 +1,148 @@
1
+ require 'digest/md5'
2
+ require 'httparty'
3
+ require 'htmlentities'
4
+
5
+ #
6
+ # Process a single HTTP request. Mostly a wrapper around HTTParty.
7
+ #
8
+
9
module Sinew
  class Error < StandardError; end

  #
  # Process a single HTTP request. Mostly a wrapper around HTTParty.
  #
  class Request
    HTML_ENTITIES = HTMLEntities.new
    VALID_METHODS = %w[get post patch put delete head options].freeze

    attr_reader :sinew, :method, :uri, :options, :cache_key

    # Options are largely compatible with HTTParty, except for :method.
    #
    # sinew   - owner object; #runtime_options is read from it
    # method  - HTTP method as a lowercase String (checked in #perform)
    # url     - String url; sloppy urls are cleaned up, see #parse_url
    # options - Hash of request options. It is copied, and :query is removed
    #           from the copy and folded into the uri.
    def initialize(sinew, method, url, options = {})
      @sinew = sinew
      @method = method
      @options = options.dup
      @uri = parse_url(url)
      @cache_key = calculate_cache_key
    end

    # run the request, return the result (a Response built from the HTTParty
    # response). Raises RuntimeError if the request fails #validate!.
    def perform
      validate!

      # merge global/options headers; request-specific values win
      merged = sinew.runtime_options.headers
      merged = merged.merge(options[:headers]) if options[:headers]
      options[:headers] = merged

      party_response = HTTParty.send(method, uri, options)
      Response.from_network(self, party_response)
    end

    protected

    # We accept sloppy urls and attempt to clean them up.
    # Returns a URI; options[:query] is consumed here.
    def parse_url(url)
      s = url

      # remove entities
      s = HTML_ENTITIES.decode(s)

      # fix a couple of common encoding bugs
      s = s.gsub(' ', '%20')
      s = s.gsub("'", '%27')

      # append query manually (instead of letting HTTParty handle it) so we can
      # include it in cache_key
      query = options.delete(:query)
      if query.present?
        q = HTTParty::HashConversions.to_params(query)
        separator = s.include?('?') ? '&' : '?'
        s = "#{s}#{separator}#{q}"
      end

      URI.parse(s)
    end

    # Build the cache key "<host dir>/<path>" from the method, uri path, uri
    # query and request body.
    def calculate_cache_key
      dir = pathify(uri.host)

      body_key = if body.is_a?(Hash)
        HTTParty::HashConversions.to_params(body)
      else
        body&.dup
      end

      # build key, as a hash for before_generate_cache_key
      key = {
        method: method.dup,
        path: uri.path,
        query: uri.query,
        body: body_key,
      }
      key = sinew.runtime_options.before_generate_cache_key.call(key)

      # strip method for gets
      key.delete(:method) if key[:method] == 'get'

      # pull out the values, join and pathify
      path = key.values.select(&:present?).join(',')
      path = pathify(path)

      # shorten long paths
      path = Digest::MD5.hexdigest(path) if path.length > 250

      "#{dir}/#{path}"
    end

    # Raise a RuntimeError describing the first problem found with this
    # request; return nil when it looks sane.
    def validate!
      raise "invalid method #{method}" if !VALID_METHODS.include?(method)
      # Anchor the whole scheme: /^http/ also accepted schemes that merely
      # start with "http" (e.g. "httpx").
      raise "invalid url #{uri}" if uri.scheme !~ /\Ahttps?\z/
      raise "can't get with a body" if method == 'get' && body
      raise "Content-Type doesn't make sense without a body" if content_type && !body
    end

    # Request body from the options, or nil.
    def body
      options[:body]
    end

    # Request-specific headers from the options, or nil.
    def headers
      options[:headers]
    end

    # Value of the 'Content-Type' request header, if any (exact key match).
    def content_type
      headers && headers['Content-Type']
    end

    # Is this request declared as a urlencoded form post?
    def form?
      content_type == 'application/x-www-form-urlencoded'
    end

    # Convert a string into something safe to use as a single path component.
    def pathify(s)
      # remove leading slash
      s = s.gsub(/^\//, '')
      # .. => comma
      s = s.gsub('..', ',')
      # query separators => comma
      s = s.gsub(/[?\/&]/, ',')
      # ,, => comma
      s = s.gsub(',,', ',')
      # encode invalid path chars. Every byte of the character is encoded:
      # unpack('H2') kept only the first byte, so distinct multibyte characters
      # (e.g. in a request body) could collapse to the same key. Output for
      # single-byte characters is unchanged.
      s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |char|
        char.bytes.map { |byte| format('%%%02x', byte) }.join
      end
      # handle empty case
      s = '_root_' if s.blank?
      # always downcase
      s.downcase
    end
  end
end
@@ -0,0 +1,75 @@
1
+ #
2
+ # An HTTP response. Mostly a wrapper around HTTParty.
3
+ #
4
+
5
module Sinew
  #
  # An HTTP response. Mostly a wrapper around HTTParty.
  #
  class Response
    attr_accessor :request, :uri, :body, :code, :headers

    class << self
      # Build a Response from a live HTTParty response.
      #
      # request        - the Request that was performed
      # party_response - object responding to #request (with #last_uri),
      #                  #code, #headers and #body
      def from_network(request, party_response)
        response = Response.new
        response.request = request
        response.uri = party_response.request.last_uri
        response.code = party_response.code
        response.headers = party_response.headers.to_h

        # force to utf-8 as best we can; bytes that cannot be converted
        # become '?'
        text = party_response.body
        unless text.encoding == Encoding::UTF_8
          text = text.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
        end
        response.body = text
        response
      end

      # Build a Response from cached data.
      #
      # request - the Request being served from cache
      # body    - cached body
      # head    - JSON string with :uri, :code and :headers (the shape produced
      #           by #head_as_json), or nil when no head was cached
      def from_cache(request, body, head)
        # NOTE(review): symbolize_names also symbolizes the header names, so
        # cached headers are keyed by Symbol - confirm callers expect that.
        meta = head ? JSON.parse(head, symbolize_names: true) : nil

        response = Response.new
        response.request = request
        response.body = body
        if meta
          # cached response head wins over the defaults
          response.uri = URI.parse(meta[:uri])
          response.code = meta[:code]
          response.headers = meta[:headers]
        else
          # defaults
          response.uri = request.uri
          response.code = 200
          response.headers = {}
        end
        response
      end

      # Build a placeholder Response for a request that timed out
      # (code 999, body 'timeout').
      def from_timeout(request)
        response = Response.new
        response.request = request
        response.uri = request.uri
        response.body = 'timeout'
        response.code = 999
        response.headers = {}
        response
      end
    end

    # True for any status code of 400 or above.
    def error?
      code >= 400
    end

    # True for any status code of 500 or above.
    def error_500?
      code >= 500
    end

    # True when the final uri differs from the uri that was requested.
    def redirected?
      !(request.uri == uri)
    end

    # The response head (everything but the body) as a Hash.
    def head_as_json
      { uri: uri, code: code, headers: headers }
    end
  end
end
@@ -0,0 +1,26 @@
1
+ #
2
+ # Runtime options that sinew files can modify.
3
+ #
4
+
5
module Sinew
  #
  # Runtime options that sinew files can modify.
  #
  class RuntimeOptions
    attr_accessor :retries, :rate_limit, :headers, :before_generate_cache_key

    # Set the defaults: 3 retries, a rate limit of 1 (0 when the SINEW_TEST
    # environment variable is set, for testing), a sinew User-Agent header,
    # and a cache key hook that returns its argument unchanged.
    def initialize
      self.retries = 3
      self.rate_limit = ENV['SINEW_TEST'] ? 0 : 1
      self.headers = { 'User-Agent' => "sinew/#{VERSION}" }
      self.before_generate_cache_key = ->(key) { key }
    end
  end
end
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = "1.0.4"
3
+ VERSION = '2.0.0'.freeze
4
4
  end
@@ -1,7 +1,9 @@
1
# Sample recipe: fetch http://httpbin.org, then hand one row (url, title) per
# node matched by 'ul li a' to csv_emit.
- get "http://httpbin.org"
2
- noko.css("ul li a").each do |a|
3
- row = { }
1
+ get 'http://httpbin.org'
2
+ noko.css('ul li a').each do |a|
3
+ row = {}
4
4
  row[:url] = a[:href]
5
5
  row[:title] = a.text
6
6
  csv_emit(row)
7
7
  end
8
+
# The added (+) version of the recipe finishes by requesting /redirect/2.
9
+ get 'http://httpbin.org/redirect/2'
@@ -1,29 +1,34 @@
1
- $LOAD_PATH << File.expand_path("../lib", __FILE__)
1
+ $LOAD_PATH.unshift("#{__dir__}/lib")
2
2
 
3
- require "sinew/version"
3
+ require 'sinew/version'
4
4
 
5
5
  # Gem specification for sinew. Lines marked "-" are the removed (1.0.4)
  # side of this diff and lines marked "+" the added (2.0.0) side.
  Gem::Specification.new do |s|
6
- s.name = "sinew"
6
+ s.name = 'sinew'
7
7
  s.version = Sinew::VERSION
8
8
  s.platform = Gem::Platform::RUBY
9
- s.authors = ["Adam Doppelt"]
10
- s.email = ["amd@gurge.com"]
11
- s.homepage = "http://github.com/gurgeous/sinew"
12
- s.summary = "Sinew - structured web crawling using recipes."
13
- s.description = "Crawl web sites easily using ruby recipes, with caching and nokogiri."
9
+ s.license = 'MIT'
10
+ s.authors = [ 'Adam Doppelt' ]
11
+ s.email = [ 'amd@gurge.com' ]
12
+ s.homepage = 'http://github.com/gurgeous/sinew'
13
+ s.summary = 'Sinew - structured web crawling using recipes.'
14
+ s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
15
+ s.required_ruby_version = '~> 2.3'
14
16
 
15
- s.rubyforge_project = "sinew"
17
+ s.rubyforge_project = 'sinew'
16
18
 
  # Dependencies. The added side pins every dependency with a pessimistic
  # (~>) version constraint.
17
- s.add_runtime_dependency "activesupport", "~> 3.0"
18
- s.add_runtime_dependency "awesome_print"
19
- s.add_runtime_dependency "htmlentities"
20
- s.add_runtime_dependency "nokogiri"
21
- s.add_runtime_dependency "stringex", "~> 2.0"
22
- s.add_runtime_dependency "trollop"
23
- s.add_development_dependency "rake"
19
+ s.add_runtime_dependency 'awesome_print', '~> 1.8'
20
+ s.add_runtime_dependency 'htmlentities', '~> 4.3'
21
+ s.add_runtime_dependency 'httparty', '~> 0.16'
22
+ s.add_runtime_dependency 'nokogiri', '~> 1.8'
23
+ s.add_runtime_dependency 'scripto', '~> 0'
24
+ s.add_runtime_dependency 'slop', '~> 4.6'
25
+ s.add_runtime_dependency 'stringex', '~> 2.8'
26
+ s.add_development_dependency 'minitest', '~> 5.11'
27
+ s.add_development_dependency 'rake', '~> 12.3'
28
+ s.add_development_dependency 'webmock', '~> 3.4'
24
29
 
  # Packaged files, test files and executables are listed by shelling out to
  # `git ls-files`.
25
30
  s.files = `git ls-files`.split("\n")
26
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
27
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
28
- s.require_paths = ["lib"]
31
+ s.test_files = `git ls-files -- test/*`.split("\n")
32
+ s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
33
+ s.require_paths = [ 'lib' ]
29
34
  end