wikiscript 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +2 -1
- data/Rakefile +2 -1
- data/lib/wikiscript.rb +16 -4
- data/lib/wikiscript/client.rb +43 -29
- data/lib/wikiscript/page.rb +1 -0
- data/lib/wikiscript/version.rb +1 -1
- data/test/test_page.rb +41 -0
- data/test/test_page_de.rb +28 -0
- metadata +23 -10
- data/test/test_austria.rb +0 -24
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
data/lib/wikiscript.rb
CHANGED
@@ -1,17 +1,18 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
## stdlibs
|
2
4
|
|
3
5
|
require 'net/http'
|
4
6
|
require 'uri'
|
5
|
-
require '
|
7
|
+
require 'cgi'
|
6
8
|
require 'pp'
|
7
|
-
require 'ostruct'
|
8
9
|
|
9
10
|
|
10
11
|
## 3rd party gems/libs
|
11
12
|
## require 'props'
|
12
13
|
|
13
14
|
require 'logutils'
|
14
|
-
|
15
|
+
require 'fetcher'
|
15
16
|
|
16
17
|
# our own code
|
17
18
|
|
@@ -29,7 +30,18 @@ module Wikiscript
|
|
29
30
|
def self.root
|
30
31
|
"#{File.expand_path( File.dirname(File.dirname(__FILE__)) )}"
|
31
32
|
end
|
32
|
-
|
33
|
+
|
34
|
+
|
35
|
+
## for now make lang a global - change why? why not??
|
36
|
+
def self.lang=(value)
|
37
|
+
@@lang = value.to_s # use to_s - lets you pass ing :en, :de etc.
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.lang
|
41
|
+
# note: for now always returns a string e.g. 'en', 'de' etc. not a symbol
|
42
|
+
@@lang ||= 'en'
|
43
|
+
end
|
44
|
+
|
33
45
|
end # module Wikiscript
|
34
46
|
|
35
47
|
|
data/lib/wikiscript/client.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
|
2
3
|
module Wikiscript
|
3
4
|
|
@@ -5,52 +6,65 @@ module Wikiscript
|
|
5
6
|
|
6
7
|
include LogUtils::Logging
|
7
8
|
|
8
|
-
SITE_BASE = 'http://en.wikipedia.org/w/index.php'
|
9
|
+
SITE_BASE = 'http://{lang}.wikipedia.org/w/index.php'
|
9
10
|
|
10
11
|
### API_BASE = 'http://en.wikipedia.org/w/api.php'
|
11
12
|
|
12
13
|
def initialize( opts={} )
|
13
|
-
@opts
|
14
|
+
@opts = opts
|
15
|
+
@worker = Fetcher::Worker.new
|
14
16
|
end
|
15
17
|
|
18
|
+
## change to: wikitext why? why not? or to raw? why? why not?
|
16
19
|
def text( title )
|
17
|
-
## todo/fix:
|
18
|
-
|
19
|
-
get( "action=raw&title=#{title}" )
|
20
|
+
## todo/fix: convert spaces to _ if not present for wikipedia page title - why ?? why not ???
|
21
|
+
get( action: 'raw', title: title )
|
20
22
|
end
|
21
23
|
|
22
24
|
private
|
23
|
-
|
24
|
-
|
25
|
+
def site_base
|
26
|
+
## replace lang w/ lang config if present e.g.
|
27
|
+
## http://{lang}.wikipedia.org/w/index.php
|
28
|
+
# becomes
|
29
|
+
# http://en.wikipedia.org/w/index.php or
|
30
|
+
# http://de.wikipedia.org/w/index.php etc
|
31
|
+
|
32
|
+
SITE_BASE.gsub( "{lang}", Wikiscript.lang )
|
33
|
+
end
|
34
|
+
|
35
|
+
def build_query( h )
|
36
|
+
h.map do |k,v|
|
37
|
+
"#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}"
|
38
|
+
end.join( '&' )
|
39
|
+
end
|
25
40
|
|
26
41
|
def get( params )
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
proxy = ENV['HTTP_PROXY']
|
32
|
-
proxy = ENV['http_proxy'] if proxy.nil? # try possible lower/case env variable (for *nix systems) is this necessary??
|
33
|
-
|
34
|
-
if proxy
|
35
|
-
proxy = URI.parse( proxy )
|
36
|
-
logger.debug "using net http proxy: proxy.host=#{proxy.host}, proxy.port=#{proxy.port}"
|
37
|
-
if proxy.user && proxy.password
|
38
|
-
logger.debug " using credentials: proxy.user=#{proxy.user}, proxy.password=****"
|
39
|
-
else
|
40
|
-
logger.debug " using no credentials"
|
41
|
-
end
|
42
|
-
else
|
43
|
-
logger.debug "using direct net http access; no proxy configured"
|
44
|
-
proxy = OpenStruct.new # all fields return nil (e.g. proxy.host, etc.)
|
45
|
-
end
|
42
|
+
# note: lets us passing in params as hash e.g.
|
43
|
+
# action: 'raw', title: 'Austria'
|
44
|
+
# key and values will get CGI escaped
|
45
|
+
query = build_query( params )
|
46
46
|
|
47
|
-
|
47
|
+
## uri = URI.parse( "#{SITE_BASE}?#{params}" )
|
48
|
+
## fix: pass in uri (add to fetcher check for is_a? URI etc.)
|
49
|
+
uri_string = "#{site_base}?#{query}"
|
48
50
|
|
49
|
-
|
50
|
-
response = http.request( Net::HTTP::Get.new( uri.request_uri ))
|
51
|
+
response = @worker.get_response( uri_string )
|
51
52
|
|
52
53
|
if response.code == '200'
|
53
54
|
t = response.body
|
55
|
+
###
|
56
|
+
# NB: Net::HTTP will NOT set encoding UTF-8 etc.
|
57
|
+
# will mostly be ASCII
|
58
|
+
# - try to change encoding to UTF-8 ourselves
|
59
|
+
logger.debug "t.encoding.name (before): #{t.encoding.name}"
|
60
|
+
#####
|
61
|
+
# NB: ASCII-8BIT == BINARY == Encoding Unknown; Raw Bytes Here
|
62
|
+
|
63
|
+
## NB:
|
64
|
+
# for now "hardcoded" to utf8 - what else can we do?
|
65
|
+
# - note: force_encoding will NOT change the chars only change the assumed encoding w/o translation
|
66
|
+
t = t.force_encoding( Encoding::UTF_8 )
|
67
|
+
logger.debug "t.encoding.name (after): #{t.encoding.name}"
|
54
68
|
## pp t
|
55
69
|
t
|
56
70
|
else
|
data/lib/wikiscript/page.rb
CHANGED
data/lib/wikiscript/version.rb
CHANGED
data/test/test_page.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestPage < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
def setup
|
10
|
+
Wikiscript.lang = :en
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_austria_en
|
14
|
+
page = Wikiscript::Page.new( 'Austria' )
|
15
|
+
text = page.text
|
16
|
+
|
17
|
+
## print first 600 chars
|
18
|
+
pp text[0..600]
|
19
|
+
|
20
|
+
## check for some snippets
|
21
|
+
assert( /{{Infobox country/ =~ text )
|
22
|
+
assert( /common_name = Austria/ =~ text )
|
23
|
+
assert( /capital = \[\[Vienna\]\]/ =~ text )
|
24
|
+
assert( /The origins of modern-day Austria date back to the time/ =~ text )
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_sankt_poelten_en
|
28
|
+
page = Wikiscript::Page.new( 'Sankt_Pölten' )
|
29
|
+
text = page.text
|
30
|
+
|
31
|
+
## print first 600 chars
|
32
|
+
pp text[0..600]
|
33
|
+
|
34
|
+
## check for some snippets
|
35
|
+
assert( /{{Infobox Town AT/ =~ text )
|
36
|
+
assert( /Name\s+=\s+Sankt Pölten/ =~ text )
|
37
|
+
assert( /'''Sankt Pölten''' \(''St. Pölten''\) is the capital city of/ =~ text )
|
38
|
+
end
|
39
|
+
|
40
|
+
end # class TestPage
|
41
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
|
7
|
+
class TestPageDe < MiniTest::Unit::TestCase
|
8
|
+
|
9
|
+
def setup
|
10
|
+
Wikiscript.lang = :de
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_st_poelten_de
|
14
|
+
page = Wikiscript::Page.new( 'St._Pölten' )
|
15
|
+
text = page.text
|
16
|
+
|
17
|
+
## print first 600 chars
|
18
|
+
pp text[0..600]
|
19
|
+
|
20
|
+
## check for some snippets
|
21
|
+
assert( /{{Infobox Gemeinde in Österreich/ =~ text )
|
22
|
+
assert( /Name\s+=\s+St\. Pölten/ =~ text )
|
23
|
+
assert( /'''St\. Pölten''' \(amtlicher Name,/ =~ text )
|
24
|
+
assert( /Die Stadt liegt am Fluss \[\[Traisen \(Fluss\)\|Traisen\]\]/ =~ text )
|
25
|
+
end
|
26
|
+
|
27
|
+
end # class TestPageDe
|
28
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wikiscript
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-07-
|
12
|
+
date: 2014-07-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &79133900 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,21 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *79133900
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: fetcher
|
27
|
+
requirement: &79133630 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *79133630
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: rdoc
|
27
|
-
requirement: &
|
38
|
+
requirement: &79133350 !ruby/object:Gem::Requirement
|
28
39
|
none: false
|
29
40
|
requirements:
|
30
41
|
- - ~>
|
@@ -32,10 +43,10 @@ dependencies:
|
|
32
43
|
version: '4.0'
|
33
44
|
type: :development
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *79133350
|
36
47
|
- !ruby/object:Gem::Dependency
|
37
48
|
name: hoe
|
38
|
-
requirement: &
|
49
|
+
requirement: &79133100 !ruby/object:Gem::Requirement
|
39
50
|
none: false
|
40
51
|
requirements:
|
41
52
|
- - ~>
|
@@ -43,7 +54,7 @@ dependencies:
|
|
43
54
|
version: '3.11'
|
44
55
|
type: :development
|
45
56
|
prerelease: false
|
46
|
-
version_requirements: *
|
57
|
+
version_requirements: *79133100
|
47
58
|
description: wikiscript - scripts for wikipedia (get wikitext for page etc.)
|
48
59
|
email: opensport@googlegroups.com
|
49
60
|
executables: []
|
@@ -62,7 +73,8 @@ files:
|
|
62
73
|
- lib/wikiscript/page.rb
|
63
74
|
- lib/wikiscript/version.rb
|
64
75
|
- test/helper.rb
|
65
|
-
- test/test_austria.rb
|
76
|
+
- test/test_page.rb
|
77
|
+
- test/test_page_de.rb
|
66
78
|
- .gemtest
|
67
79
|
homepage: https://github.com/wikiscript/wikiscript.ruby
|
68
80
|
licenses:
|
@@ -92,4 +104,5 @@ signing_key:
|
|
92
104
|
specification_version: 3
|
93
105
|
summary: wikiscript - scripts for wikipedia (get wikitext for page etc.)
|
94
106
|
test_files:
|
95
|
-
- test/test_austria.rb
|
107
|
+
- test/test_page_de.rb
|
108
|
+
- test/test_page.rb
|
data/test/test_austria.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
require 'helper'
|
5
|
-
|
6
|
-
|
7
|
-
class TestAustria < MiniTest::Unit::TestCase
|
8
|
-
|
9
|
-
def test_text
|
10
|
-
page = Wikiscript::Page.new( 'Austria' )
|
11
|
-
text = page.text
|
12
|
-
|
13
|
-
## print first 600 chars
|
14
|
-
pp text[0..600]
|
15
|
-
|
16
|
-
## check for some snippets
|
17
|
-
assert( /{{Infobox country/ =~ text )
|
18
|
-
assert( /common_name = Austria/ =~ text )
|
19
|
-
assert( /capital = \[\[Vienna\]\]/ =~ text )
|
20
|
-
assert( /The origins of modern-day Austria date back to the time/ =~ text )
|
21
|
-
end
|
22
|
-
|
23
|
-
end # class TestAustria
|
24
|
-
|