wikiscript 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +2 -1
- data/Rakefile +2 -1
- data/lib/wikiscript.rb +16 -4
- data/lib/wikiscript/client.rb +43 -29
- data/lib/wikiscript/page.rb +1 -0
- data/lib/wikiscript/version.rb +1 -1
- data/test/test_page.rb +41 -0
- data/test/test_page_de.rb +28 -0
- metadata +23 -10
- data/test/test_austria.rb +0 -24
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
data/lib/wikiscript.rb
CHANGED
@@ -1,17 +1,18 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
## stdlibs
|
2
4
|
|
3
5
|
require 'net/http'
|
4
6
|
require 'uri'
|
5
|
-
require '
|
7
|
+
require 'cgi'
|
6
8
|
require 'pp'
|
7
|
-
require 'ostruct'
|
8
9
|
|
9
10
|
|
10
11
|
## 3rd party gems/libs
|
11
12
|
## require 'props'
|
12
13
|
|
13
14
|
require 'logutils'
|
14
|
-
|
15
|
+
require 'fetcher'
|
15
16
|
|
16
17
|
# our own code
|
17
18
|
|
@@ -29,7 +30,18 @@ module Wikiscript
|
|
29
30
|
def self.root
|
30
31
|
"#{File.expand_path( File.dirname(File.dirname(__FILE__)) )}"
|
31
32
|
end
|
32
|
-
|
33
|
+
|
34
|
+
|
35
|
+
## for now make lang a global - change why? why not??
|
36
|
+
def self.lang=(value)
|
37
|
+
@@lang = value.to_s # use to_s - lets you pass in :en, :de etc.
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.lang
|
41
|
+
# note: for now always returns a string e.g. 'en', 'de' etc. not a symbol
|
42
|
+
@@lang ||= 'en'
|
43
|
+
end
|
44
|
+
|
33
45
|
end # module Wikiscript
|
34
46
|
|
35
47
|
|
data/lib/wikiscript/client.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
|
2
3
|
module Wikiscript
|
3
4
|
|
@@ -5,52 +6,65 @@ module Wikiscript
|
|
5
6
|
|
6
7
|
include LogUtils::Logging
|
7
8
|
|
8
|
-
SITE_BASE = 'http://en.wikipedia.org/w/index.php'
|
9
|
+
SITE_BASE = 'http://{lang}.wikipedia.org/w/index.php'
|
9
10
|
|
10
11
|
### API_BASE = 'http://en.wikipedia.org/w/api.php'
|
11
12
|
|
12
13
|
def initialize( opts={} )
|
13
|
-
@opts
|
14
|
+
@opts = opts
|
15
|
+
@worker = Fetcher::Worker.new
|
14
16
|
end
|
15
17
|
|
18
|
+
## change to: wikitext why? why not? or to raw? why? why not?
|
16
19
|
def text( title )
|
17
|
-
## todo/fix:
|
18
|
-
|
19
|
-
get( "action=raw&title=#{title}" )
|
20
|
+
## todo/fix: convert spaces to _ if not present for wikipedia page title - why ?? why not ???
|
21
|
+
get( action: 'raw', title: title )
|
20
22
|
end
|
21
23
|
|
22
24
|
private
|
23
|
-
|
24
|
-
|
25
|
+
def site_base
|
26
|
+
## replace lang w/ lang config if present e.g.
|
27
|
+
## http://{lang}.wikipedia.org/w/index.php
|
28
|
+
# becomes
|
29
|
+
# http://en.wikipedia.org/w/index.php or
|
30
|
+
# http://de.wikipedia.org/w/index.php etc
|
31
|
+
|
32
|
+
SITE_BASE.gsub( "{lang}", Wikiscript.lang )
|
33
|
+
end
|
34
|
+
|
35
|
+
def build_query( h )
|
36
|
+
h.map do |k,v|
|
37
|
+
"#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}"
|
38
|
+
end.join( '&' )
|
39
|
+
end
|
25
40
|
|
26
41
|
def get( params )
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
proxy = ENV['HTTP_PROXY']
|
32
|
-
proxy = ENV['http_proxy'] if proxy.nil? # try possible lower/case env variable (for *nix systems) is this necessary??
|
33
|
-
|
34
|
-
if proxy
|
35
|
-
proxy = URI.parse( proxy )
|
36
|
-
logger.debug "using net http proxy: proxy.host=#{proxy.host}, proxy.port=#{proxy.port}"
|
37
|
-
if proxy.user && proxy.password
|
38
|
-
logger.debug " using credentials: proxy.user=#{proxy.user}, proxy.password=****"
|
39
|
-
else
|
40
|
-
logger.debug " using no credentials"
|
41
|
-
end
|
42
|
-
else
|
43
|
-
logger.debug "using direct net http access; no proxy configured"
|
44
|
-
proxy = OpenStruct.new # all fields return nil (e.g. proxy.host, etc.)
|
45
|
-
end
|
42
|
+
# note: lets us pass in params as a hash e.g.
|
43
|
+
# action: 'raw', title: 'Austria'
|
44
|
+
# key and values will get CGI escaped
|
45
|
+
query = build_query( params )
|
46
46
|
|
47
|
-
|
47
|
+
## uri = URI.parse( "#{SITE_BASE}?#{params}" )
|
48
|
+
## fix: pass in uri (add to fetcher check for is_a? URI etc.)
|
49
|
+
uri_string = "#{site_base}?#{query}"
|
48
50
|
|
49
|
-
|
50
|
-
response = http.request( Net::HTTP::Get.new( uri.request_uri ))
|
51
|
+
response = @worker.get_response( uri_string )
|
51
52
|
|
52
53
|
if response.code == '200'
|
53
54
|
t = response.body
|
55
|
+
###
|
56
|
+
# NB: Net::HTTP will NOT set encoding UTF-8 etc.
|
57
|
+
# will mostly be ASCII
|
58
|
+
# - try to change encoding to UTF-8 ourselves
|
59
|
+
logger.debug "t.encoding.name (before): #{t.encoding.name}"
|
60
|
+
#####
|
61
|
+
# NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
62
|
+
|
63
|
+
## NB:
|
64
|
+
# for now "hardcoded" to utf8 - what else can we do?
|
65
|
+
# - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
|
66
|
+
t = t.force_encoding( Encoding::UTF_8 )
|
67
|
+
logger.debug "t.encoding.name (after): #{t.encoding.name}"
|
54
68
|
## pp t
|
55
69
|
t
|
56
70
|
else
|
data/lib/wikiscript/page.rb
CHANGED
data/lib/wikiscript/version.rb
CHANGED
data/test/test_page.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestPage < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
def setup
|
10
|
+
Wikiscript.lang = :en
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_austria_en
|
14
|
+
page = Wikiscript::Page.new( 'Austria' )
|
15
|
+
text = page.text
|
16
|
+
|
17
|
+
## print first 600 chars
|
18
|
+
pp text[0..600]
|
19
|
+
|
20
|
+
## check for some snippets
|
21
|
+
assert( /{{Infobox country/ =~ text )
|
22
|
+
assert( /common_name = Austria/ =~ text )
|
23
|
+
assert( /capital = \[\[Vienna\]\]/ =~ text )
|
24
|
+
assert( /The origins of modern-day Austria date back to the time/ =~ text )
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_sankt_poelten_en
|
28
|
+
page = Wikiscript::Page.new( 'Sankt_Pölten' )
|
29
|
+
text = page.text
|
30
|
+
|
31
|
+
## print first 600 chars
|
32
|
+
pp text[0..600]
|
33
|
+
|
34
|
+
## check for some snippets
|
35
|
+
assert( /{{Infobox Town AT/ =~ text )
|
36
|
+
assert( /Name\s+=\s+Sankt Pölten/ =~ text )
|
37
|
+
assert( /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text )
|
38
|
+
end
|
39
|
+
|
40
|
+
end # class TestPage
|
41
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestPageDe < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
def setup
|
10
|
+
Wikiscript.lang = :de
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_st_poelten_de
|
14
|
+
page = Wikiscript::Page.new( 'St._Pölten' )
|
15
|
+
text = page.text
|
16
|
+
|
17
|
+
## print first 600 chars
|
18
|
+
pp text[0..600]
|
19
|
+
|
20
|
+
## check for some snippets
|
21
|
+
assert( /{{Infobox Gemeinde in Österreich/ =~ text )
|
22
|
+
assert( /Name\s+=\s+St\. Pölten/ =~ text )
|
23
|
+
assert( /'''St\. Pölten''' \(amtlicher Name,/ =~ text )
|
24
|
+
assert( /Die Stadt liegt am Fluss \[\[Traisen \(Fluss\)\|Traisen\]\]/ =~ text )
|
25
|
+
end
|
26
|
+
|
27
|
+
end # class TestPageDe
|
28
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wikiscript
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-07-
|
12
|
+
date: 2014-07-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &79133900 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,21 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *79133900
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: fetcher
|
27
|
+
requirement: &79133630 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *79133630
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: rdoc
|
27
|
-
requirement: &
|
38
|
+
requirement: &79133350 !ruby/object:Gem::Requirement
|
28
39
|
none: false
|
29
40
|
requirements:
|
30
41
|
- - ~>
|
@@ -32,10 +43,10 @@ dependencies:
|
|
32
43
|
version: '4.0'
|
33
44
|
type: :development
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *79133350
|
36
47
|
- !ruby/object:Gem::Dependency
|
37
48
|
name: hoe
|
38
|
-
requirement: &
|
49
|
+
requirement: &79133100 !ruby/object:Gem::Requirement
|
39
50
|
none: false
|
40
51
|
requirements:
|
41
52
|
- - ~>
|
@@ -43,7 +54,7 @@ dependencies:
|
|
43
54
|
version: '3.11'
|
44
55
|
type: :development
|
45
56
|
prerelease: false
|
46
|
-
version_requirements: *
|
57
|
+
version_requirements: *79133100
|
47
58
|
description: wikiscript - scripts for wikipedia (get wikitext for page etc.)
|
48
59
|
email: opensport@googlegroups.com
|
49
60
|
executables: []
|
@@ -62,7 +73,8 @@ files:
|
|
62
73
|
- lib/wikiscript/page.rb
|
63
74
|
- lib/wikiscript/version.rb
|
64
75
|
- test/helper.rb
|
65
|
-
- test/test_austria.rb
|
76
|
+
- test/test_page.rb
|
77
|
+
- test/test_page_de.rb
|
66
78
|
- .gemtest
|
67
79
|
homepage: https://github.com/wikiscript/wikiscript.ruby
|
68
80
|
licenses:
|
@@ -92,4 +104,5 @@ signing_key:
|
|
92
104
|
specification_version: 3
|
93
105
|
summary: wikiscript - scripts for wikipedia (get wikitext for page etc.)
|
94
106
|
test_files:
|
95
|
-
- test/test_austria.rb
|
107
|
+
- test/test_page_de.rb
|
108
|
+
- test/test_page.rb
|
data/test/test_austria.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
require 'helper'
|
5
|
-
|
6
|
-
|
7
|
-
class TestAustria < MiniTest::Unit::TestCase
|
8
|
-
|
9
|
-
def test_text
|
10
|
-
page = Wikiscript::Page.new( 'Austria' )
|
11
|
-
text = page.text
|
12
|
-
|
13
|
-
## print first 600 chars
|
14
|
-
pp text[0..600]
|
15
|
-
|
16
|
-
## check for some snippets
|
17
|
-
assert( /{{Infobox country/ =~ text )
|
18
|
-
assert( /common_name = Austria/ =~ text )
|
19
|
-
assert( /capital = \[\[Vienna\]\]/ =~ text )
|
20
|
-
assert( /The origins of modern-day Austria date back to the time/ =~ text )
|
21
|
-
end
|
22
|
-
|
23
|
-
end # class TestAustria
|
24
|
-
|