curl 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of curl might be problematic. Click here for more details.
- data/lib/curl.rb +3 -2
- data/lib/string_cleaner.rb +181 -0
- metadata +19 -3
data/lib/curl.rb
CHANGED
@@ -4,6 +4,7 @@ require "open3"
|
|
4
4
|
require 'fileutils'
|
5
5
|
require 'ap'
|
6
6
|
require 'digest/md5'
|
7
|
+
require 'string_cleaner'
|
7
8
|
|
8
9
|
|
9
10
|
include Open3
|
@@ -96,7 +97,7 @@ class CURL
|
|
96
97
|
filename = cache_file(url)
|
97
98
|
unless File.exists?(filename)
|
98
99
|
FileUtils.mkdir_p(cache_path(url))
|
99
|
-
result = get_raw(url,count,ref)
|
100
|
+
result = get_raw(url,count,ref) #+" --output \"#{filename}\" ")
|
100
101
|
puts "cache to file '#{filename}'" if @debug
|
101
102
|
File.open(filename,"w"){|f| f.puts result}
|
102
103
|
return result
|
@@ -121,7 +122,7 @@ class CURL
|
|
121
122
|
count -= 1
|
122
123
|
result = self.get(url,count) if count > 0
|
123
124
|
end
|
124
|
-
result
|
125
|
+
result.clean
|
125
126
|
|
126
127
|
end
|
127
128
|
|
@@ -0,0 +1,181 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require "unidecoder"
|
3
|
+
|
4
|
+
# Text clean-up helpers intended to be mixed into String (every method
# operates on +self+ and returns a new string; the receiver is never mutated).
#
# The code keeps two execution paths: strings that respond to
# +force_encoding+ (Ruby 1.9+) use the built-in encoding support, while
# older interpreters fall back to the +iconv+ and +oniguruma+ libraries.
module String::Cleaner

  # Code points that render as some kind of blank; each is turned into a
  # plain ASCII space by #fix_invisible_chars. Stored as UTF-8 strings.
  SPECIAL_SPACES = [
    0x00A0,                # NO-BREAK SPACE
    0x1680,                # OGHAM SPACE MARK
    0x180E,                # MONGOLIAN VOWEL SEPARATOR
    (0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
    0x2028,                # LINE SEPARATOR
    0x2029,                # PARAGRAPH SEPARATOR
    0x202F,                # NARROW NO-BREAK SPACE
    0x205F,                # MEDIUM MATHEMATICAL SPACE
    0x3000,                # IDEOGRAPHIC SPACE
  ].flatten.map { |code| [code].pack("U*") }.freeze

  # Code points with no width; #fix_invisible_chars deletes them outright.
  ZERO_WIDTH = [
    0x200B, # ZERO WIDTH SPACE
    0x200C, # ZERO WIDTH NON-JOINER
    0x200D, # ZERO WIDTH JOINER
    0x2060, # WORD JOINER
    0xFEFF, # ZERO WIDTH NO-BREAK SPACE
  ].map { |code| [code].pack("U*") }.freeze

  # Alternations over the two lists above, compiled once instead of on
  # every call to #fix_invisible_chars.
  ZERO_WIDTH_PATTERN     = Regexp.union(*ZERO_WIDTH)
  SPECIAL_SPACES_PATTERN = Regexp.union(*SPECIAL_SPACES)

  # Runs the full pipeline: encoding repair, then line-ending
  # normalisation, then invisible-character clean-up.
  #
  # @return [String] the cleaned copy
  def clean
    fix_encoding.fix_endlines.fix_invisible_chars
  end

  # Returns a UTF-8 copy of the string.
  #
  # On Ruby 1.9+ the bytes are first tagged as UTF-8; if that is not a valid
  # encoding they are reinterpreted as ISO8859-1 and transcoded to UTF-8.
  # Afterwards U+0080 and the currency sign are both rewritten to the euro
  # sign (note this also affects a currency sign in already-valid input).
  #
  # @return [String]
  def fix_encoding
    utf8 = dup
    if utf8.respond_to?(:force_encoding)
      utf8.force_encoding("UTF-8") # for Ruby 1.9+
      unless utf8.valid_encoding? # if invalid UTF-8
        utf8.force_encoding("ISO8859-1")
        utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
      end
      utf8.gsub!(/\u0080|¤/, "€") # special case for euro sign from Windows-1252
      utf8
    else
      require "iconv"
      # A space is appended before conversion and is NOT removed here.
      # NOTE(review): the legacy branch of #fix_invisible_chars chops one
      # trailing character, which looks like the counterpart of this
      # padding -- confirm before calling either method on its own.
      utf8 << " "
      begin
        Iconv.new("UTF-8", "UTF-8").iconv(utf8)
      rescue
        utf8.gsub!(/\x80/n, "\xA4")
        Iconv.new("UTF-8//IGNORE", "ISO8859-1").iconv(utf8).gsub("¤", "€")
      end
    end
  end

  # Converts CRLF and lone CR line endings to LF.
  #
  # @return [String]
  def fix_endlines
    gsub(/\r\n?/u, "\n")
  end

  # Removes zero-width characters, replaces whitespace and Unicode "Other"
  # (\p{C}) characters inside each line with a space, and maps the
  # SPECIAL_SPACES code points to a plain space. Line feeds are kept.
  #
  # @return [String]
  def fix_invisible_chars
    text = dup
    text.gsub!(ZERO_WIDTH_PATTERN, "")
    text =
      if text.respond_to?(:force_encoding)
        # The temporary trailing space stops split from discarding trailing
        # empty lines; chop removes it again after the join.
        (text << " ").split(/\n/u).map { |line|
          line.gsub(/[\s\p{C}]/u, " ")
        }.join("\n").chop
      else
        require "oniguruma"
        control_chars = Oniguruma::ORegexp.new("[\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8})
        # Non-bang chop: chop! returns nil for an empty string, which made
        # the gsub! below fail.
        text.split(/\n/n).map { |line|
          control_chars.gsub(line, " ")
        }.join("\n").chop
      end
    text.gsub!(SPECIAL_SPACES_PATTERN, " ")
    text
  end

  # Strips leading and trailing characters.
  #
  # @param chars [String] the set of characters to strip; every character is
  #   taken literally (so "]", "^" or "\\" are safe, and "a-z" means the
  #   three characters a, - and z). When empty, behaves like String#strip.
  # @return [String]
  def trim(chars = "")
    return strip if chars.to_s.empty?

    # Build an alternation of the individual characters; Regexp.union
    # escapes each one, so no input can break or alter the pattern.
    any_char = Regexp.union(*chars.to_s.split(//).uniq)
    gsub(/\A(?:#{any_char})+|(?:#{any_char})+\z/, "")
  end

  # Builds a URL-friendly slug: clean, transliterate with +to_ascii+ (not
  # defined in this module; it is called with #chartable as argument),
  # lowercase, collapse every run of characters outside a-z0-9 into
  # +separator+, then trim +separator+ from both ends.
  #
  # @param separator [String]
  # @return [String]
  def to_permalink(separator = "-")
    # Block form so the separator is inserted verbatim (a replacement
    # *string* would have backslash sequences interpreted by gsub).
    clean.to_ascii(chartable).downcase.gsub(/[^a-z0-9]+/) { separator }.trim(separator)
  end

  # Inserts an HTML line break before every line feed.
  #
  # @return [String]
  def nl2br
    gsub("\n", "<br/>\n")
  end

  # Underscore-separated permalink converted to a Symbol.
  #
  # @return [Symbol]
  def to_nicer_sym
    to_permalink("_").to_sym
  end

  # Builds the character => replacement table that #to_permalink hands to
  # +to_ascii+.
  #
  # @param options [Hash]
  # @option options [Boolean] :clean_binary (true) add entries that drop
  #   ASCII control characters (tab becomes a space; LF and CR get no entry)
  # @option options [Boolean] :translit_symbols (true) add entries that spell
  #   out symbols such as "$", "&" or the euro sign as English words
  # @return [Hash{String => String}]
  def chartable(options = {})
    options = {
      :clean_binary     => true,
      :translit_symbols => true,
    }.merge(options)
    char = "%c"
    table = {
      "`" => "'",   # dec = 96
      "¦" => "|",   # dec = 166, broken vertical bar
      "¨" => "",    # dec = 168, spacing diaeresis - umlaut
      "ª" => "",    # dec = 170, feminine ordinal indicator
      "«" => "\"",  # dec = 171, left double angle quotes
      "¬" => "!",   # dec = 172, not sign
      # dec = 173, soft hyphen -- built from its code point because the
      # character is invisible in source text
      [0x00AD].pack("U") => "-",
      "¯" => "-",   # dec = 175, spacing macron - overline
      "²" => "2",   # dec = 178, superscript two - squared
      "³" => "3",   # dec = 179, superscript three - cubed
      "´" => "'",   # dec = 180, acute accent - spacing acute
      "·" => "",    # dec = 183, middle dot - Georgian comma
      "¸" => "",    # dec = 184, spacing cedilla
      "¹" => "1",   # dec = 185, superscript one
      "º" => "0",   # dec = 186, masculine ordinal indicator
      "»" => "\"",  # dec = 187, right double angle quotes
      "¿" => "",    # dec = 191, inverted question mark
      "Ý" => "Y",   # dec = 221
      "–" => "-",   # hex = 2013, en dash
      "—" => "-",   # hex = 2014, em dash
      "‚" => "'",   # hex = 201A, single low-9 quotation mark
      "„" => "\"",  # hex = 201E, double low-9 quotation mark
    }
    if options[:clean_binary]
      # NUL..TAB, VT, FF, SO..US and DEL. Line feed (10) and carriage
      # return (13) are deliberately absent from the list.
      control_codes = (0..9).to_a + [11, 12] + (14..31).to_a + [127]
      control_codes.each do |code|
        table[char % code] = (code == 9 ? " " : "") # tab -> space, rest dropped
      end
    end
    if options[:translit_symbols]
      table.merge!(
        "$"        => " dollars ",              # dec = 36, dollar sign
        "%"        => " percent ",              # dec = 37, percent sign
        "&"        => " and ",                  # dec = 38, ampersand
        "@"        => " at ",                   # dec = 64, at symbol
        char % 128 => " euros ",                # windows euro
        "¢"        => " cents ",                # dec = 162, cent sign
        "£"        => " pounds ",               # dec = 163, pound sign
        "¤"        => " euros ",                # dec = 164, currency sign
        "¥"        => " yens ",                 # dec = 165, yen sign
        "§"        => " section ",              # dec = 167, section sign
        "©"        => " copyright ",            # dec = 169, copyright sign
        "®"        => " registered trademark ", # dec = 174, registered trade mark sign
        "°"        => " degrees ",              # dec = 176, degree sign
        "±"        => " approx ",               # dec = 177, plus-or-minus sign
        "µ"        => " micro ",                # dec = 181, micro sign
        "¶"        => " paragraph ",            # dec = 182, pilcrow sign - paragraph sign
        "¼"        => " 1/4 ",                  # dec = 188, fraction one quarter
        "½"        => " 1/2 ",                  # dec = 189, fraction one half
        "¾"        => " 3/4 ",                  # dec = 190, fraction three quarters
        "€"        => " euros ",                # hex = 20AC, unicode euro
        "™"        => " trademark "             # hex = 2122, trade mark
      )
    end
    table
  end

end
|
179
|
+
# Mix the cleaner helpers into every String instance.
String.send(:include, String::Cleaner)
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 5
|
9
|
-
version: 0.0.5
|
8
|
+
- 6
|
9
|
+
version: 0.0.6
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- tg0
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-10-
|
17
|
+
date: 2010-10-30 00:00:00 +03:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -32,6 +32,21 @@ dependencies:
|
|
32
32
|
version: 0.2.1
|
33
33
|
type: :runtime
|
34
34
|
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: unidecoder
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
segments:
|
44
|
+
- 1
|
45
|
+
- 1
|
46
|
+
- 1
|
47
|
+
version: 1.1.1
|
48
|
+
type: :runtime
|
49
|
+
version_requirements: *id002
|
35
50
|
description: Some simple methods to use shell curl
|
36
51
|
email: email@tg0.ru
|
37
52
|
executables: []
|
@@ -43,6 +58,7 @@ extra_rdoc_files: []
|
|
43
58
|
files:
|
44
59
|
- README
|
45
60
|
- lib/curl.rb
|
61
|
+
- lib/string_cleaner.rb
|
46
62
|
has_rdoc: true
|
47
63
|
homepage: http://github.com/tg0/curl
|
48
64
|
licenses: []
|