wptemplates 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/.travis.yml +8 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +92 -0
- data/README.rdoc +8 -0
- data/Rakefile +4 -0
- data/lib/wptemplates.rb +20 -0
- data/lib/wptemplates/ast.rb +91 -0
- data/lib/wptemplates/parser.rb +112 -0
- data/lib/wptemplates/preprocessor.rb +17 -0
- data/lib/wptemplates/regexes.rb +95 -0
- data/lib/wptemplates/utils.rb +38 -0
- data/lib/wptemplates/version.rb +3 -0
- data/spec/regexes_spec.rb +458 -0
- data/spec/spec_helper.rb +23 -0
- data/spec/utils_spec.rb +90 -0
- data/spec/wptemplates_links_spec.rb +249 -0
- data/spec/wptemplates_mixed_spec.rb +82 -0
- data/spec/wptemplates_templates_spec.rb +161 -0
- data/spec/wptemplates_text_spec.rb +75 -0
- data/tasks/browser.rake +4 -0
- data/tasks/bundler_gem.rake +1 -0
- data/tasks/console.rake +7 -0
- data/tasks/irbrc.rb +22 -0
- data/tasks/rdoc.rake +16 -0
- data/tasks/readme_examples.rake +23 -0
- data/tasks/readme_html.rake +22 -0
- data/tasks/rspec.rake +10 -0
- data/wptemplates.gemspec +29 -0
- metadata +167 -0
data/.gitignore
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
spec/examples_spec.rb
|
16
|
+
test/tmp
|
17
|
+
test/version_tmp
|
18
|
+
tmp
|
19
|
+
.irb_history
|
20
|
+
README.html
|
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Bernhard Häussner
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# Wptemplates
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/bxt/wptemplates.png?branch=master)](https://travis-ci.org/bxt/wptemplates)
|
4
|
+
|
5
|
+
Gem for collecting template information from MediaWiki markup.
|
6
|
+
|
7
|
+
It will help you to extract useful machine-readable data from
|
8
|
+
Wikipedia articles, since there is a lot of useful stuff
|
9
|
+
encoded as templates.
|
10
|
+
|
11
|
+
Currently only templates and links are parsed, all other markup is ignored.
|
12
|
+
|
13
|
+
## Installation
|
14
|
+
|
15
|
+
Add this line to your application's Gemfile:
|
16
|
+
|
17
|
+
gem 'wptemplates', git: 'git://github.com/bxt/wptemplates.git'
|
18
|
+
|
19
|
+
And then execute:
|
20
|
+
|
21
|
+
$ bundle
|
22
|
+
|
23
|
+
The gem is currently not in the rubygems.org repository.
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
To parse a piece of markup simply call:
|
28
|
+
|
29
|
+
<!-- EXAMPLES:INIT -->
|
30
|
+
ast = Wptemplates.parse("{{foo | bar | x = 3 }} baz [[bam (2003)|]]y")
|
31
|
+
|
32
|
+
<!-- /EXAMPLES -->
|
33
|
+
|
34
|
+
You will get an instance of Wptemplates::Soup which is an array of
|
35
|
+
Wptemplates::Template, Wptemplates::Link and Wptemplates::Text.
|
36
|
+
You can explore the AST with these methods:
|
37
|
+
|
38
|
+
<!-- EXAMPLES:intro -->
|
39
|
+
ast.templates.is_a?(Array) && ast.templates.length # => 1
|
40
|
+
ast.text # => " baz bamy"
|
41
|
+
ast[0].name # => :foo
|
42
|
+
ast[0].params[0].text # => " bar "
|
43
|
+
ast[0].params[:x].text # => "3"
|
44
|
+
ast.all_templates_of(:foo).map{|t| t.params[:x].text} # => ["3"]
|
45
|
+
<!-- /EXAMPLES -->
|
46
|
+
|
47
|
+
You can access the links via:
|
48
|
+
|
49
|
+
<!-- EXAMPLES:links -->
|
50
|
+
ast.links.length # => 1
|
51
|
+
ast.links[0].text # => "bamy"
|
52
|
+
ast.all_links.map{|l| l.link} # => ["Bam (2003)"]
|
53
|
+
<!-- /EXAMPLES -->
|
54
|
+
|
55
|
+
## Developing
|
56
|
+
|
57
|
+
Here's some useful info if you want to improve/customize this gem.
|
58
|
+
|
59
|
+
### Getting Started
|
60
|
+
|
61
|
+
Checkout the project, run `bundle` and then `rake` to see if the tests
|
62
|
+
pass. Run `rake -T` to see the rake tasks.
|
63
|
+
|
64
|
+
### Markup
|
65
|
+
|
66
|
+
MediaWiki markup is not trivial to parse and there might always
|
67
|
+
be compatibility issues. There's a useful help page about
|
68
|
+
[templates][tmplh] and a [markup spec][mspec]. For links there
|
69
|
+
is a page about [links][linkh] and about the [pipe trick][ptrkh].
|
70
|
+
Also, there is a page with [link's BNF][lnbnf].
|
71
|
+
|
72
|
+
### Known Issues
|
73
|
+
|
74
|
+
* If you have images in your templates the pipes cause a new parameter
|
75
|
+
* Namespaced links are not recognized
|
76
|
+
* Templates in links are not recognized
|
77
|
+
* Link contents are not HTML-decoded
|
78
|
+
* nowiki, pre and math blocks might cause problems
|
79
|
+
|
80
|
+
## Contributing
|
81
|
+
|
82
|
+
1. Fork it
|
83
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
84
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
85
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
86
|
+
5. Create new Pull Request
|
87
|
+
|
88
|
+
[tmplh]: http://en.wikipedia.org/wiki/Help:Template#Usage_syntax "English Wikipedia Template help page, syntax section"
|
89
|
+
[mspec]: http://www.mediawiki.org/wiki/Markup_spec "MediaWiki Markup spec"
|
90
|
+
[linkh]: http://en.wikipedia.org/wiki/Help:Link "English Wikipedia Link help page"
|
91
|
+
[ptrkh]: http://en.wikipedia.org/wiki/Help:Pipe_trick "English Wikipedia Pipe trick help page"
|
92
|
+
[lnbnf]: http://www.mediawiki.org/wiki/Markup_spec/BNF/Links "MediaWiki Link BNF"
|
data/README.rdoc
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
= Wptemplates
|
2
|
+
|
3
|
+
{<img src="https://travis-ci.org/bxt/wptemplates.png?branch=master" alt="Build Status" />}[https://travis-ci.org/bxt/wptemplates]
|
4
|
+
|
5
|
+
The public entry method for this gem is Wptemplates.parse
|
6
|
+
It will give you Wptemplates::Soup, Wptemplates::Link,
|
7
|
+
Wptemplates::Template and Wptemplates::Text elements,
|
8
|
+
which you can traverse and filter.
|
data/Rakefile
ADDED
data/lib/wptemplates.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require "wptemplates/version"
|
2
|
+
require "wptemplates/parser"
|
3
|
+
require "wptemplates/preprocessor"
|
4
|
+
|
5
|
+
# Public entry point of the gem.
module Wptemplates

  # Parses a piece of markup into an AST.
  #
  # The text is passed through the shared preprocessor exactly once and the
  # preprocessed result is handed to the shared parser. (Parsing the raw text
  # as well would only duplicate the work and could raise on input that the
  # preprocessor is supposed to see first.)
  #
  # @param text [String] the markup to parse
  # @return whatever Parser#parse returns for the preprocessed text
  def self.parse(text)
    parser.parse(preprocessor.preprocess(text))
  end

  # @return [Parser] the parser instance memoized on this module
  def self.parser
    @parser ||= Parser.new
  end

  # @return [Preprocessor] the preprocessor instance memoized on this module
  def self.preprocessor
    @preprocessor ||= Preprocessor.new
  end

end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Wptemplates

  # Query interface shared by all AST elements. The defaults describe an
  # element that contains no templates, no links and no text; the concrete
  # classes override whichever of these apply to them.
  module Node
    # @return [Array] templates represented directly by this node
    def templates
      []
    end

    # @return [Array] templates of this node including nested ones
    def all_templates
      templates
    end

    # @return [String] the textual content of this node
    def text
      ""
    end

    # @param type the template name to look for (compared with ==)
    # @return [Array] direct templates whose name equals +type+
    def templates_of(type)
      templates.select { |t| t.name == type }
    end

    # @return the first direct template named +type+, or nil
    def template_of(type)
      templates_of(type).first
    end

    # @return [Array] direct and nested templates whose name equals +type+
    def all_templates_of(type)
      all_templates.select { |t| t.name == type }
    end

    # @return the first direct or nested template named +type+, or nil
    def deep_template_of(type)
      all_templates_of(type).first
    end

    # @return [Array] links represented directly by this node
    def links
      []
    end

    # @return [Array] links of this node including nested ones
    def all_links
      links
    end
  end

  # An ordered list of nodes; every query is answered by asking each element
  # and concatenating the answers.
  class Soup < Array
    include Node

    def templates
      flat_map(&:templates)
    end

    def links
      flat_map(&:links)
    end

    def all_templates
      flat_map(&:all_templates)
    end

    def all_links
      flat_map(&:all_links)
    end

    def text
      map(&:text).join('')
    end
  end

  # A template invocation with a name and a hash of parameter values.
  class Template
    include Node
    attr_reader :name, :params

    # @param name the template name
    # @param params [Hash] parameter key => parameter value; each value must
    #   answer #all_templates and #all_links
    def initialize(name, params = {})
      @name = name
      @params = params
    end

    def templates
      [self]
    end

    # This template followed by every template found at any depth inside its
    # parameter values. Recursing via #all_templates (not #templates) is what
    # makes templates nested two or more levels deep visible.
    def all_templates
      templates + @params.flat_map { |_, value| value.all_templates }
    end

    # Every link found at any depth inside the parameter values.
    def all_links
      links + @params.flat_map { |_, value| value.all_links }
    end
  end

  # A run of plain text.
  class Text
    include Node
    attr_reader :text

    def initialize(text)
      @text = text
    end
  end

  # A link with its displayed text, its target and an optional anchor.
  class Link
    include Node
    attr_reader :text, :link, :anchor

    def initialize(text, link, anchor)
      @text = text
      @link = link
      @anchor = anchor
    end

    def links
      [self]
    end
  end

end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
require 'wptemplates/regexes'
|
3
|
+
require 'wptemplates/utils'
|
4
|
+
require 'wptemplates/ast'
|
5
|
+
|
6
|
+
module Wptemplates
  # Recursive-descent parser built on StringScanner. It turns markup into a
  # Soup of Template, Link and Text nodes; the patterns come from Regexes and
  # the normalisation helpers from Utils.
  class Parser
    include Regexes
    include Utils

    # Parses +text+ from the beginning.
    #
    # @param text [String] markup to parse
    # @return [Soup] the parsed nodes (never empty)
    # @raise [RuntimeError] "unclosed template" when a template is not
    #   terminated by a double closing brace
    def parse(text)
      @input = StringScanner.new(text)
      parse_main
    end

    protected

    # Collects units until none of the sub-parsers matches any more.
    #
    # @param in_template_parameter [Boolean] when true, plain text also stops
    #   at a pipe or a double closing brace
    # @return [Soup] the collected nodes; holds a single empty Text when
    #   nothing was matched
    def parse_main(in_template_parameter = false)
      output = Soup.new

      while (unit = parse_link || parse_template || parse_anything(in_template_parameter))
        output << unit
      end

      output << Text.new("") if output.empty?

      output
    end

    # @return [Template, nil] a template when the input continues with a
    #   double open brace, nil otherwise
    def parse_template
      return unless @input.scan(a_doubleopenbrace)

      template = Template.new(parse_template_name, parse_template_parameters)
      raise "unclosed template" unless @input.scan(a_doubleclosingbrace)
      template
    end

    # @return [Text, nil] the next run of plain text, or nil when none matches
    def parse_anything(in_template_parameter = false)
      pattern =
        if in_template_parameter
          till_doublebrace_doubleopenbrackets_or_pipe
        else
          till_doubleopenbrace_or_doubleopenbrackets
        end
      @input.scan(pattern) && Text.new(@input.matched)
    end

    # @return the symbolized template name, or nil when no name is present
    def parse_template_name
      symbolize(@input.matched) if @input.scan(till_doubleclosebrace_or_pipe)
    end

    # Reads all parameters of the current template.
    #
    # @return [Hash] named parameters under their symbolized key, unnamed
    #   ones under their position; the position counter advances for named
    #   parameters as well
    def parse_template_parameters
      position = 0
      params = {}
      while parsed_named_template_parameter(params) || parse_numeric_template_parameter(params, position)
        position += 1
      end
      params
    end

    # Reads one "|key=value" parameter into +h+ and trims the whitespace
    # around its value.
    #
    # @return [Soup, nil] the stored value, or nil when the input does not
    #   continue with a named parameter
    def parsed_named_template_parameter(h)
      return unless @input.scan(from_pipe_till_equals_no_doubleclosebrace_or_pipe)

      key = symbolize(@input[1])
      value = parse_main(true)
      { l: 0, r: -1 }.each do |side, index|
        value[index].text.send(:"#{side}strip!")
        edge = value[index]
        # Only whitespace-only Text nodes are dropped. A Template always
        # reports an empty text, so without the type check a template at the
        # edge of the value would be removed as well.
        value.delete_at(index) if edge.is_a?(Text) && edge.text.empty? && value.length > 1
      end
      h[key] = value
    end

    # Reads one unnamed "|value" parameter into +h+ under key +i+.
    #
    # @return [Soup, nil] the stored value, or nil when no pipe follows
    def parse_numeric_template_parameter(h, i)
      return unless @input.scan(a_pipe)

      h[i] = parse_main(true)
    end

    # @return [Link, nil] a link when the input continues with link markup
    def parse_link
      return unless @input.scan(a_link)

      url, label, letters = (1..3).map { |group| @input[group] }
      if label == ""
        # "[[target|]]": the label is derived from the target.
        pipe_trick(url, label, letters)
      else
        link_new_with_normalize((label || url) + letters, url[until_hash], url[after_hash])
      end
    end

    # Builds the link for an empty label by deriving the label from +url+.
    #
    # @return [Link, nil] nil when +url+ contains a "#"
    def pipe_trick(url, label, letters)
      if url["#"]
        # NOTE(review): the scanner has already consumed the link markup at
        # this point, so returning nil means that markup does not appear in
        # the output at all - confirm this is intended.
        nil
      elsif (m = has_parens.match(url))
        link_new_with_normalize(m[:no_parens] + letters, url, nil)
      else
        label = fixpoint(clone: true, start: url) do |u|
          u[first_comma, :before][parens, :before]
        end
        link_new_with_normalize(label + letters, url, nil)
      end
    end

    # Normalises the three parts and wraps them in a Link.
    def link_new_with_normalize(text, link, anchor)
      text = normalize_linklabel(text)
      link = normalize_link(link)
      anchor = normalize_link(anchor, true) unless anchor.nil?
      Link.new(text, link, anchor)
    end

  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Wptemplates
  # The regular expressions used for scanning markup. Every method simply
  # returns a pattern; thanks to module_function they can be called on the
  # module itself or mixed into another class.
  module Regexes
    module_function

    # Text up to (not including) a double brace, a pipe, or a double open
    # bracket that is not at the start.
    def till_doublebrace_doubleopenbrackets_or_pipe
      /(
        [^{}\[|] # Unproblematic chars
        | { (?!{ ) # A lone open brace
        | } (?!} ) # A lone close brace
        | \[(?!\[) # A lone open bracket
        | ^\[\[ # Doubleopenbrackets at start
      )+/x
    end

    # Text up to (not including) a double open brace or a double open
    # bracket that is not at the start.
    def till_doubleopenbrace_or_doubleopenbrackets
      /(
        [^{\[] # Unproblematic chars
        | { (?!{ ) # A lone open brace
        | \[(?!\[) # A lone open bracket
        | ^\[\[ # Doubleopenbrackets at start
      )+/x
    end

    # Text up to (not including) a pipe or a double close brace.
    def till_doubleclosebrace_or_pipe
      /(
        [^|}] # Unproblematic chars
        | } (?!} ) # A lone close brace
      )+/x
    end

    # A pipe, then a key (capture 1) free of pipes, equals signs and double
    # close braces, then an equals sign.
    def from_pipe_till_equals_no_doubleclosebrace_or_pipe
      /
        \| # Pipe
        ((
          [^|=}] # Unproblematic chars
          |}(?!}) # A lone close brace
        )*)
        = # Equals
      /x
    end

    # A single pipe.
    def a_pipe
      /\|/
    end

    # Two open braces.
    def a_doubleopenbrace
      /{{/
    end

    # Two close braces.
    def a_doubleclosingbrace
      /}}/
    end

    # A complete link: target with optional "#anchor" (link), optional
    # description after a pipe (link-description) and the letters directly
    # following the closing brackets (extra_letters).
    def a_link
      /
        \[\[
        (?<link>
          # ([% title-legal-chars])+
          [%\ !"$&'()*,\-.\/0-9:;=?@A-Z\\^_`a-z~\u0080-\uFFFF+]+
          # ("#" [# % title-legal-chars]+)?
          ( \# [\#%\ !"$&'()*,\-.\/0-9:;=?@A-Z\\^_`a-z~\u0080-\uFFFF+]+ )?
        )
        (
          # "|" LEGAL_ARTICLE_ENTITY*
          \| (?<link-description>([^\]]|\](?!\]))*)
        )?
        \]\]
        (?<extra_letters>\p{L}*)
      /x
    end

    # Everything before the first hash sign.
    def until_hash
      /[^#]*/
    end

    # Everything after the first hash sign.
    def after_hash
      /(?<=#).*/
    end

    # A string ending in a parenthesised part; no_parens is what precedes it.
    def has_parens
      /^(?<no_parens>.*?) *\(.*\) *$/
    end

    # The part (before) preceding the first comma that is followed by a space.
    def first_comma
      /^(?<before>([^,]|,(?! ))*)(, |$)/
    end

    # The part (before) preceding an optional trailing parenthesised part.
    def parens
      /^(?<before>.*?)(\(.*\) *)?$/
    end

  end
end
|