curl 0.0.4 → 0.0.9

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of curl might be problematic. Click here for more details.

Files changed (3)
  1. data/lib/curl.rb +72 -40
  2. data/lib/string_cleaner.rb +181 -0
  3. metadata +16 -16
data/lib/curl.rb CHANGED
@@ -4,6 +4,7 @@ require "open3"
4
4
  require 'fileutils'
5
5
  require 'ap'
6
6
  require 'digest/md5'
7
+ require 'string_cleaner'
7
8
 
8
9
 
9
10
  include Open3
@@ -31,15 +32,20 @@ class CURL
31
32
  attr_accessor :user_agent
32
33
 
33
34
  def initialize(keys={})
35
+ @socks_hostname = keys[:socks_hostname] ||= false
34
36
  @cache = ( keys[:cache] ? keys[:cache] : false )
37
+ @cache_time = ( keys[:cache_time] ? keys[:cache_time] : 3600*24*1 ) # 1 day cache life
38
+ @connect_timeout = keys[:connect_timeout] || 6
39
+ @max_time = keys[:max_time] || 8
40
+ @retry = keys[:retry] || 1
35
41
  @cookies_enable = ( keys[:cookies_disable] ? false : true )
36
42
  @user_agent = AGENT_ALIASES["Google"]#AGENT_ALIASES[AGENT_ALIASES.keys[rand(6)]]
37
43
  FileUtils.makedirs("/tmp/curl/")
38
44
  @cookies_file = keys[:cookies] || "/tmp/curl/curl_#{rand}_#{rand}.jar"
39
- # @cookies_file = "/home/ruslan/curl.jar"
40
- #--header "Accept-Encoding: deflate"
41
- @setup_params = ' --header "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" --header "Accept-Language: en-us,en;q=0.5" --header "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7" '
42
- @setup_params = ' --connect-timeout 6 --max-time 8 --retry 1 --location --compressed --silent -k '
45
+ # @cookies_file = "/home/ruslan/curl.jar"
46
+ #--header "Accept-Encoding: deflate"
47
+ # @setup_params = ' --header "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" --header "Accept-Language: en-us,en;q=0.5" --header "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7" '
48
+ @setup_params = " --connect-timeout #{@connect_timeout} --max-time #{@max_time} --retry #{@retry} --location --compressed --silent -k "
43
49
  # @setup_params = ' --location --silent '
44
50
  yield self if block_given?
45
51
  end
@@ -60,10 +66,11 @@ class CURL
60
66
  end
61
67
 
62
68
  def socks(socks_uri)
63
- socks = ( socks_uri.is_a?(URI) ? socks_uri : URI.parse("http://#{socks_uri}") )
64
- @setup_params = "#{@setup_params} --socks5-hostname \"#{socks.host}:#{socks.port}\" "
65
- @setup_params = "#{@setup_params} --proxy-user \"#{socks.user}:#{socks.password}\" " if socks.user
66
- @setup_params
69
+ socks = ( socks_uri.is_a?(URI) ? socks_uri : URI.parse("http://#{socks_uri}") )
70
+ s = @socks_hostname ? "--socks5-hostname" : "--socks5"
71
+ @setup_params = "#{@setup_params} #{s} \"#{socks.host}:#{socks.port}\" "
72
+ @setup_params = "#{@setup_params} --proxy-user \"#{socks.user}:#{socks.password}\" " if socks.user
73
+ @setup_params
67
74
  end
68
75
 
69
76
  def self.check(proxy)
@@ -84,27 +91,44 @@ class CURL
84
91
  @debug
85
92
  end
86
93
 
87
- def get(url, count=3, ref=nil, keys={})
94
+ def cache_path(url)
95
+ "#{@cache}/#{Digest::MD5.hexdigest(url)[0..1]}/#{Digest::MD5.hexdigest(url)[2..3]}/#{Digest::MD5.hexdigest(url)[4..5]}/#{Digest::MD5.hexdigest(url)[6..7]}"
96
+ end
97
+ def cache_file(url)
98
+ cache_path(url)+"/#{Digest::MD5.hexdigest(url)}.html"
99
+ end
100
+
101
+ def get(url, keys={})
102
+ ref = keys[:ref] ||= nil
103
+ count = keys[:count] ||= 3
104
+ encoding = keys[:encoding] ||= "utf-8"
105
+ raw = ( keys[:raw]==nil ? false : keys[:raw] )
106
+
88
107
  if @cache
89
- filename = "#{@cache}/#{Digest::MD5.hexdigest(url)[0..3]}/#{Digest::MD5.hexdigest(url)}.html"
90
- unless File.exists?(filename)
91
- FileUtils.mkdir_p("#{@cache}/#{Digest::MD5.hexdigest(url)[0..3]}/")
92
- result = get_raw(url,count,ref)
93
- puts "cache to file '#{filename}'"
108
+ filename = cache_file(url)
109
+ unless File.exists?(filename) && (File.exists?(filename) && File.ctime(filename) > Time.now-@cache_time)
110
+ FileUtils.mkdir_p(cache_path(url))
111
+ result = get_raw(url, {:count=>count, :ref=>ref, :encoding=>encoding} ) #+" --output \"#{filename}\" ")
112
+ puts "cache to file '#{filename}'" if @debug
94
113
  File.open(filename,"w"){|f| f.puts result}
95
114
  return result
96
115
  else
97
- puts "read from cache file '#{filename}'"
116
+ puts "read from cache file '#{filename}'" if @debug
98
117
  return open(filename).read
99
118
  end
100
119
  else
101
- return get_raw(url,count,ref)
120
+ return get_raw(url, {:count=>count , :ref=>ref, :encoding=>encoding, :raw=>raw})
102
121
  end
103
122
 
104
123
  end
105
124
 
106
- def get_raw(url,count=3,ref=nil)
107
- cmd = "curl #{cookies_store} #{browser_type} #{@setup_params} #{ref} \"#{url}\" "
125
+ def get_raw(url, keys={})
126
+ ref = keys[:ref] ||= nil
127
+ count = keys[:count] ||= 3
128
+ encoding = keys[:encoding] ||= "utf-8"
129
+ raw = ( keys[:raw]==nil ? false : keys[:raw] )
130
+
131
+ cmd = "curl #{cookies_store} #{browser_type} #{@setup_params} #{ref} \"#{url}\" "
108
132
  if @debug
109
133
  puts cmd.red
110
134
  end
@@ -114,8 +138,13 @@ class CURL
114
138
  count -= 1
115
139
  result = self.get(url,count) if count > 0
116
140
  end
117
- result = result.gsub(/\\x../,'')
118
-
141
+ # result.force_encoding(encoding)
142
+ if raw
143
+ return result
144
+ else
145
+ return ( encoding=="utf-8" ? result.clean : Iconv.new("UTF-8", "WINDOWS-1251").iconv(result) )
146
+ end
147
+
119
148
  end
120
149
 
121
150
  # формат данных для поста
@@ -155,26 +184,29 @@ class CURL
155
184
  # }
156
185
  def send(url,post_data, ref = nil,count=5 )
157
186
 
158
- post_q = '' # " -F \"method\"=\"post\" "
159
- post_data.each do |key,val|
160
- pre = ""
161
- if key
162
- pre = "@" if key.scan("file").size>0 or key.scan("photo").size>0
163
- val = val.gsub('"','\"')
164
- post_q += " -F \"#{key}\"=#{pre}\"#{val}\" "
165
- end
166
- end
167
-
168
- cmd = "curl #{cookies_store} #{browser_type} #{post_q} #{@setup_params} #{ref} \"#{url}\" "
169
- puts cmd.red if @debug
170
-
171
- result = open_pipe(cmd)
172
- #if result.to_s.strip.size == 0
173
- # puts "empty result, left #{count} try".yellow if @debug
174
- # count -= 1
175
- # result = self.send(url,post_data,nil,count) if count > 0
176
- #end
177
- result
187
+ post_q = '' # " -F \"method\"=\"post\" "
188
+ post_data.each do |key,val|
189
+ pre = ""
190
+ if key
191
+ key = key.to_s
192
+ pre = "@" if key.scan("file").size>0 or key.scan("photo").size>0 or key.scan("@").size>0
193
+ key = key.to_s.gsub("@",'')
194
+ val = val.to_s
195
+ val = val.gsub('"','\"')
196
+ post_q += " -F \"#{key}\"=#{pre}\"#{val}\" "
197
+ end
198
+ end
199
+
200
+ cmd = "curl #{cookies_store} #{browser_type} #{post_q} #{@setup_params} #{ref} \"#{url}\" "
201
+ puts cmd.red if @debug
202
+
203
+ result = open_pipe(cmd)
204
+ #if result.to_s.strip.size == 0
205
+ # puts "empty result, left #{count} try".yellow if @debug
206
+ # count -= 1
207
+ # result = self.send(url,post_data,nil,count) if count > 0
208
+ #end
209
+ result
178
210
  end
179
211
 
180
212
 
@@ -0,0 +1,181 @@
1
+ # encoding: UTF-8
2
+ require "unidecoder"
3
+
4
+ module String::Cleaner
5
+
6
+ def clean
7
+ fix_encoding.fix_endlines.fix_invisible_chars
8
+ end
9
+
10
+ def fix_encoding
11
+ utf8 = dup
12
+ if utf8.respond_to?(:force_encoding)
13
+ utf8.force_encoding("UTF-8") # for Ruby 1.9+
14
+ unless utf8.valid_encoding? # if invalid UTF-8
15
+ utf8 = utf8.force_encoding("ISO8859-1")
16
+ utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
17
+ end
18
+ utf8.gsub!(/\u0080|¤/, "€") # special case for euro sign from Windows-1252
19
+ utf8
20
+ else
21
+ require "iconv"
22
+ utf8 << " "
23
+ begin
24
+ Iconv.new("UTF-8", "UTF-8").iconv(utf8)
25
+ rescue
26
+ utf8.gsub!(/\x80/n, "\xA4")
27
+ Iconv.new("UTF-8//IGNORE", "ISO8859-1").iconv(utf8).gsub("¤", "€")
28
+ end
29
+ end
30
+ end
31
+
32
+ def fix_endlines
33
+ gsub(/(?:\r\n|\r)/u, "\n")
34
+ end
35
+
36
+ SPECIAL_SPACES = [
37
+ 0x00A0, # NO-BREAK SPACE
38
+ 0x1680, # OGHAM SPACE MARK
39
+ 0x180E, # MONGOLIAN VOWEL SEPARATOR
40
+ (0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
41
+ 0x2028, # LINE SEPARATOR
42
+ 0x2029, # PARAGRAPH SEPARATOR
43
+ 0x202F, # NARROW NO-BREAK SPACE
44
+ 0x205F, # MEDIUM MATHEMATICAL SPACE
45
+ 0x3000, # IDEOGRAPHIC SPACE
46
+ ].flatten.collect{|e| [e].pack 'U*'}
47
+
48
+ ZERO_WIDTH = [
49
+ 0x200B, # ZERO WIDTH SPACE
50
+ 0x200C, # ZERO WIDTH NON-JOINER
51
+ 0x200D, # ZERO WIDTH JOINER
52
+ 0x2060, # WORD JOINER
53
+ 0xFEFF, # ZERO WIDTH NO-BREAK SPACE
54
+ ].flatten.collect{|e| [e].pack 'U*'}
55
+
56
+ def fix_invisible_chars
57
+ utf8 = self.dup
58
+ utf8.gsub!(Regexp.new(ZERO_WIDTH.join("|")), "")
59
+ utf8 = if utf8.respond_to?(:force_encoding)
60
+ utf8 = (utf8 << " ").split(/\n/u).each{|line|
61
+ line.gsub!(/[\s\p{C}]/u, " ")
62
+ }.join("\n").chop!
63
+ else
64
+ require "oniguruma"
65
+ utf8.split(/\n/n).collect{|line|
66
+ Oniguruma::ORegexp.new("[\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
67
+ }.join("\n").chop!
68
+ end
69
+ utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|") + "|\s"), " ")
70
+ utf8
71
+ end
72
+
73
+ def trim(chars = "")
74
+ chars.size>0 ? gsub(/\A[#{chars}]+|[#{chars}]+\z/, "") : strip
75
+ end
76
+
77
+ def to_permalink(separator="-")
78
+ clean.to_ascii(chartable).downcase.gsub(/[^a-z0-9]+/, separator).trim(separator)
79
+ end
80
+
81
+ def nl2br
82
+ gsub("\n", "<br/>\n")
83
+ end
84
+
85
+ def to_nicer_sym
86
+ to_permalink("_").to_sym
87
+ end
88
+
89
+ def chartable(options = {})
90
+ options = {
91
+ :clean_binary => true,
92
+ :translit_symbols => true,
93
+ }.merge(options)
94
+ char = "%c"
95
+ table = {
96
+ "`" => "'", # dec = 96
97
+ "¦" => "|", # dec = 166, broken vertical bar
98
+ "¨" => "", # dec = 168, spacing diaeresis - umlaut
99
+ "ª" => "", # dec = 170, feminine ordinal indicator
100
+ "«" => "\"", # dec = 171, left double angle quotes
101
+ "¬" => "!", # dec = 172, not sign
102
+ "­" => "-", # dec = 173, soft hyphen
103
+ "¯" => "-", # dec = 175, spacing macron - overline
104
+ "²" => "2", # dec = 178, superscript two - squared
105
+ "³" => "3", # dec = 179, superscript three - cubed
106
+ "´" => "'", # dec = 180, acute accent - spacing acute
107
+ "·" => "", # dec = 183, middle dot - Georgian comma
108
+ "¸" => "", # dec = 184, spacing cedilla
109
+ "¹" => "1", # dec = 185, superscript one
110
+ "º" => "0", # dec = 186, masculine ordinal indicator
111
+ "»" => "\"", # dec = 187, right double angle quotes
112
+ "¿" => "", # dec = 191, inverted question mark
113
+ "Ý" => "Y", # dec = 221
114
+ "–" => "-", # hex = 2013, en dash
115
+ "—" => "-", # hex = 2014, em dash
116
+ "‚" => "'", # hex = 201A, single low-9 quotation mark
117
+ "„" => "\"", # hex = 201E, double low-9 quotation mark
118
+ }
119
+ if options[:clean_binary]
120
+ table[char % 0] = "" # null
121
+ table[char % 1] = "" # start of heading
122
+ table[char % 2] = "" # start of text
123
+ table[char % 3] = "" # end of text
124
+ table[char % 4] = "" # end of transmission
125
+ table[char % 5] = "" # enquiry
126
+ table[char % 6] = "" # acknowledge
127
+ table[char % 7] = "" # bell
128
+ table[char % 8] = "" # backspace
129
+ table[char % 9] = " " # tab
130
+ table[char % 11] = "" # vertical tab
131
+ table[char % 12] = "" # form feed
132
+ table[char % 14] = "" # shift out
133
+ table[char % 15] = "" # shift in
134
+ table[char % 16] = "" # data link escape
135
+ table[char % 17] = "" # device control 1
136
+ table[char % 18] = "" # device control 2
137
+ table[char % 19] = "" # device control 3
138
+ table[char % 20] = "" # device control 4
139
+ table[char % 21] = "" # negative acknowledgement
140
+ table[char % 22] = "" # synchronous idle
141
+ table[char % 23] = "" # end of transmission block
142
+ table[char % 24] = "" # cancel
143
+ table[char % 25] = "" # end of medium
144
+ table[char % 26] = "" # substitute
145
+ table[char % 27] = "" # escape
146
+ table[char % 28] = "" # file separator
147
+ table[char % 29] = "" # group separator
148
+ table[char % 30] = "" # record separator
149
+ table[char % 31] = "" # unit separator
150
+ table[char % 127] = "" # delete
151
+ end
152
+ if options[:translit_symbols]
153
+ table["$"] = " dollars " # dec = 36, dollar sign
154
+ table["%"] = " percent " # dec = 37, percent sign
155
+ table["&"] = " and " # dec = 38, ampersand
156
+ table["@"] = " at " # dec = 64, at symbol
157
+ table[char % 128] = " euros " # windows euro
158
+ table["¢"] = " cents " # dec = 162, cent sign
159
+ table["£"] = " pounds " # dec = 163, pound sign
160
+ table["¤"] = " euros " # dec = 164, currency sign
161
+ table["¥"] = " yens " # dec = 165, yen sign
162
+ table["§"] = " section " # dec = 167, section sign
163
+ table["©"] = " copyright " # dec = 169, copyright sign
164
+ table["®"] = " registered trademark " # dec = 174, registered trade mark sign
165
+ table["°"] = " degrees " # dec = 176, degree sign
166
+ table["±"] = " approx " # dec = 177, plus-or-minus sign
167
+ table["µ"] = " micro " # dec = 181, micro sign
168
+ table["¶"] = " paragraph " # dec = 182, pilcrow sign - paragraph sign
169
+ table["¼"] = " 1/4 " # dec = 188, fraction one quarter
170
+ table["½"] = " 1/2 " # dec = 189, fraction one half
171
+ table["¾"] = " 3/4 " # dec = 190, fraction three quarters
172
+ table["€"] = " euros " # hex = 20AC, unicode euro
173
+ table["™"] = " trademark " # hex = 2122, trade mark
174
+ end
175
+ table
176
+ end
177
+
178
+ end
179
+ class String
180
+ include String::Cleaner
181
+ end
metadata CHANGED
@@ -1,12 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curl
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 0
7
- - 0
8
- - 4
9
- version: 0.0.4
4
+ prerelease:
5
+ version: 0.0.9
10
6
  platform: ruby
11
7
  authors:
12
8
  - tg0
@@ -14,7 +10,7 @@ autorequire:
14
10
  bindir: bin
15
11
  cert_chain: []
16
12
 
17
- date: 2010-10-13 00:00:00 +03:00
13
+ date: 2011-03-10 00:00:00 +02:00
18
14
  default_executable:
19
15
  dependencies:
20
16
  - !ruby/object:Gem::Dependency
@@ -25,13 +21,20 @@ dependencies:
25
21
  requirements:
26
22
  - - ">="
27
23
  - !ruby/object:Gem::Version
28
- segments:
29
- - 0
30
- - 2
31
- - 1
32
24
  version: 0.2.1
33
25
  type: :runtime
34
26
  version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: unidecoder
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: 1.1.1
36
+ type: :runtime
37
+ version_requirements: *id002
35
38
  description: Some simple methods to use shell curl
36
39
  email: email@tg0.ru
37
40
  executables: []
@@ -43,6 +46,7 @@ extra_rdoc_files: []
43
46
  files:
44
47
  - README
45
48
  - lib/curl.rb
49
+ - lib/string_cleaner.rb
46
50
  has_rdoc: true
47
51
  homepage: http://github.com/tg0/curl
48
52
  licenses: []
@@ -58,21 +62,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
58
62
  requirements:
59
63
  - - ">="
60
64
  - !ruby/object:Gem::Version
61
- segments:
62
- - 0
63
65
  version: "0"
64
66
  required_rubygems_version: !ruby/object:Gem::Requirement
65
67
  none: false
66
68
  requirements:
67
69
  - - ">="
68
70
  - !ruby/object:Gem::Version
69
- segments:
70
- - 0
71
71
  version: "0"
72
72
  requirements: []
73
73
 
74
74
  rubyforge_project: curl
75
- rubygems_version: 1.3.7
75
+ rubygems_version: 1.5.2
76
76
  signing_key:
77
77
  specification_version: 3
78
78
  summary: shell CURL ruby wrapper.