curl 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of curl might be problematic. Click here for more details.

Files changed (3):
  1. data/lib/curl.rb +3 -2
  2. data/lib/string_cleaner.rb +181 -0
  3. metadata +19 -3
@@ -4,6 +4,7 @@ require "open3"
4
4
  require 'fileutils'
5
5
  require 'ap'
6
6
  require 'digest/md5'
7
+ require 'string_cleaner'
7
8
 
8
9
 
9
10
  include Open3
@@ -96,7 +97,7 @@ class CURL
96
97
  filename = cache_file(url)
97
98
  unless File.exists?(filename)
98
99
  FileUtils.mkdir_p(cache_path(url))
99
- result = get_raw(url,count,ref)
100
+ result = get_raw(url,count,ref) #+" --output \"#{filename}\" ")
100
101
  puts "cache to file '#{filename}'" if @debug
101
102
  File.open(filename,"w"){|f| f.puts result}
102
103
  return result
@@ -121,7 +122,7 @@ class CURL
121
122
  count -= 1
122
123
  result = self.get(url,count) if count > 0
123
124
  end
124
- result = result.gsub(/\\x../,'')
125
+ result.clean
125
126
 
126
127
  end
127
128
 
@@ -0,0 +1,181 @@
1
+ # encoding: UTF-8
2
+ require "unidecoder"
3
+
4
+ module String::Cleaner
5
+
6
+ def clean
7
+ fix_encoding.fix_endlines.fix_invisible_chars
8
+ end
9
+
10
+ def fix_encoding
11
+ utf8 = dup
12
+ if utf8.respond_to?(:force_encoding)
13
+ utf8.force_encoding("UTF-8") # for Ruby 1.9+
14
+ unless utf8.valid_encoding? # if invalid UTF-8
15
+ utf8 = utf8.force_encoding("ISO8859-1")
16
+ utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
17
+ end
18
+ utf8.gsub!(/\u0080|¤/, "€") # special case for euro sign from Windows-1252
19
+ utf8
20
+ else
21
+ require "iconv"
22
+ utf8 << " "
23
+ begin
24
+ Iconv.new("UTF-8", "UTF-8").iconv(utf8)
25
+ rescue
26
+ utf8.gsub!(/\x80/n, "\xA4")
27
+ Iconv.new("UTF-8//IGNORE", "ISO8859-1").iconv(utf8).gsub("¤", "€")
28
+ end
29
+ end
30
+ end
31
+
32
+ def fix_endlines
33
+ gsub(/(?:\r\n|\r)/u, "\n")
34
+ end
35
+
36
+ SPECIAL_SPACES = [
37
+ 0x00A0, # NO-BREAK SPACE
38
+ 0x1680, # OGHAM SPACE MARK
39
+ 0x180E, # MONGOLIAN VOWEL SEPARATOR
40
+ (0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
41
+ 0x2028, # LINE SEPARATOR
42
+ 0x2029, # PARAGRAPH SEPARATOR
43
+ 0x202F, # NARROW NO-BREAK SPACE
44
+ 0x205F, # MEDIUM MATHEMATICAL SPACE
45
+ 0x3000, # IDEOGRAPHIC SPACE
46
+ ].flatten.collect{|e| [e].pack 'U*'}
47
+
48
+ ZERO_WIDTH = [
49
+ 0x200B, # ZERO WIDTH SPACE
50
+ 0x200C, # ZERO WIDTH NON-JOINER
51
+ 0x200D, # ZERO WIDTH JOINER
52
+ 0x2060, # WORD JOINER
53
+ 0xFEFF, # ZERO WIDTH NO-BREAK SPACE
54
+ ].flatten.collect{|e| [e].pack 'U*'}
55
+
56
+ def fix_invisible_chars
57
+ utf8 = self.dup
58
+ utf8.gsub!(Regexp.new(ZERO_WIDTH.join("|")), "")
59
+ utf8 = if utf8.respond_to?(:force_encoding)
60
+ utf8 = (utf8 << " ").split(/\n/u).each{|line|
61
+ line.gsub!(/[\s\p{C}]/u, " ")
62
+ }.join("\n").chop!
63
+ else
64
+ require "oniguruma"
65
+ utf8.split(/\n/n).collect{|line|
66
+ Oniguruma::ORegexp.new("[\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
67
+ }.join("\n").chop!
68
+ end
69
+ utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|") + "|\s"), " ")
70
+ utf8
71
+ end
72
+
73
+ def trim(chars = "")
74
+ chars.size>0 ? gsub(/\A[#{chars}]+|[#{chars}]+\z/, "") : strip
75
+ end
76
+
77
+ def to_permalink(separator="-")
78
+ clean.to_ascii(chartable).downcase.gsub(/[^a-z0-9]+/, separator).trim(separator)
79
+ end
80
+
81
+ def nl2br
82
+ gsub("\n", "<br/>\n")
83
+ end
84
+
85
+ def to_nicer_sym
86
+ to_permalink("_").to_sym
87
+ end
88
+
89
+ def chartable(options = {})
90
+ options = {
91
+ :clean_binary => true,
92
+ :translit_symbols => true,
93
+ }.merge(options)
94
+ char = "%c"
95
+ table = {
96
+ "`" => "'", # dec = 96
97
+ "¦" => "|", # dec = 166, broken vertical bar
98
+ "¨" => "", # dec = 168, spacing diaeresis - umlaut
99
+ "ª" => "", # dec = 170, feminine ordinal indicator
100
+ "«" => "\"", # dec = 171, left double angle quotes
101
+ "¬" => "!", # dec = 172, not sign
102
+ "­" => "-", # dec = 173, soft hyphen
103
+ "¯" => "-", # dec = 175, spacing macron - overline
104
+ "²" => "2", # dec = 178, superscript two - squared
105
+ "³" => "3", # dec = 179, superscript three - cubed
106
+ "´" => "'", # dec = 180, acute accent - spacing acute
107
+ "·" => "", # dec = 183, middle dot - Georgian comma
108
+ "¸" => "", # dec = 184, spacing cedilla
109
+ "¹" => "1", # dec = 185, superscript one
110
+ "º" => "0", # dec = 186, masculine ordinal indicator
111
+ "»" => "\"", # dec = 187, right double angle quotes
112
+ "¿" => "", # dec = 191, inverted question mark
113
+ "Ý" => "Y", # dec = 221
114
+ "–" => "-", # hex = 2013, en dash
115
+ "—" => "-", # hex = 2014, em dash
116
+ "‚" => "'", # hex = 201A, single low-9 quotation mark
117
+ "„" => "\"", # hex = 201E, double low-9 quotation mark
118
+ }
119
+ if options[:clean_binary]
120
+ table[char % 0] = "" # null
121
+ table[char % 1] = "" # start of heading
122
+ table[char % 2] = "" # start of text
123
+ table[char % 3] = "" # end of text
124
+ table[char % 4] = "" # end of transmission
125
+ table[char % 5] = "" # enquiry
126
+ table[char % 6] = "" # acknowledge
127
+ table[char % 7] = "" # bell
128
+ table[char % 8] = "" # backspace
129
+ table[char % 9] = " " # tab
130
+ table[char % 11] = "" # vertical tab
131
+ table[char % 12] = "" # form feed
132
+ table[char % 14] = "" # shift out
133
+ table[char % 15] = "" # shift in
134
+ table[char % 16] = "" # data link escape
135
+ table[char % 17] = "" # device control 1
136
+ table[char % 18] = "" # device control 2
137
+ table[char % 19] = "" # device control 3
138
+ table[char % 20] = "" # device control 4
139
+ table[char % 21] = "" # negative acknowledgement
140
+ table[char % 22] = "" # synchronous idle
141
+ table[char % 23] = "" # end of transmission block
142
+ table[char % 24] = "" # cancel
143
+ table[char % 25] = "" # end of medium
144
+ table[char % 26] = "" # substitute
145
+ table[char % 27] = "" # escape
146
+ table[char % 28] = "" # file separator
147
+ table[char % 29] = "" # group separator
148
+ table[char % 30] = "" # record separator
149
+ table[char % 31] = "" # unit separator
150
+ table[char % 127] = "" # delete
151
+ end
152
+ if options[:translit_symbols]
153
+ table["$"] = " dollars " # dec = 36, dollar sign
154
+ table["%"] = " percent " # dec = 37, percent sign
155
+ table["&"] = " and " # dec = 38, ampersand
156
+ table["@"] = " at " # dec = 64, at symbol
157
+ table[char % 128] = " euros " # windows euro
158
+ table["¢"] = " cents " # dec = 162, cent sign
159
+ table["£"] = " pounds " # dec = 163, pound sign
160
+ table["¤"] = " euros " # dec = 164, currency sign
161
+ table["¥"] = " yens " # dec = 165, yen sign
162
+ table["§"] = " section " # dec = 167, section sign
163
+ table["©"] = " copyright " # dec = 169, copyright sign
164
+ table["®"] = " registered trademark " # dec = 174, registered trade mark sign
165
+ table["°"] = " degrees " # dec = 176, degree sign
166
+ table["±"] = " approx " # dec = 177, plus-or-minus sign
167
+ table["µ"] = " micro " # dec = 181, micro sign
168
+ table["¶"] = " paragraph " # dec = 182, pilcrow sign - paragraph sign
169
+ table["¼"] = " 1/4 " # dec = 188, fraction one quarter
170
+ table["½"] = " 1/2 " # dec = 189, fraction one half
171
+ table["¾"] = " 3/4 " # dec = 190, fraction three quarters
172
+ table["€"] = " euros " # hex = 20AC, unicode euro
173
+ table["™"] = " trademark " # hex = 2122, trade mark
174
+ end
175
+ table
176
+ end
177
+
178
+ end
179
+ class String
180
+ include String::Cleaner
181
+ end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 5
9
- version: 0.0.5
8
+ - 6
9
+ version: 0.0.6
10
10
  platform: ruby
11
11
  authors:
12
12
  - tg0
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-10-14 00:00:00 +03:00
17
+ date: 2010-10-30 00:00:00 +03:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -32,6 +32,21 @@ dependencies:
32
32
  version: 0.2.1
33
33
  type: :runtime
34
34
  version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: unidecoder
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ segments:
44
+ - 1
45
+ - 1
46
+ - 1
47
+ version: 1.1.1
48
+ type: :runtime
49
+ version_requirements: *id002
35
50
  description: Some simple methods to use shell curl
36
51
  email: email@tg0.ru
37
52
  executables: []
@@ -43,6 +58,7 @@ extra_rdoc_files: []
43
58
  files:
44
59
  - README
45
60
  - lib/curl.rb
61
+ - lib/string_cleaner.rb
46
62
  has_rdoc: true
47
63
  homepage: http://github.com/tg0/curl
48
64
  licenses: []