wptemplates 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,20 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ spec/examples_spec.rb
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ .irb_history
20
+ README.html
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/.travis.yml ADDED
@@ -0,0 +1,8 @@
1
+ language: ruby
2
+ rvm:
3
+ - "1.9.2"
4
+ - "1.9.3"
5
+ - jruby-19mode # JRuby in 1.9 mode
6
+ - rbx-19mode
7
+ # uncomment this line if your project needs to run something other than `rake`:
8
+ # script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in wptemplates.gemspec
4
+ gemspec
5
+
6
+ platforms :ruby_19, :rbx do
7
+ gem 'redcarpet', '~> 2'
8
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Bernhard Häussner
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # Wptemplates
2
+
3
+ [![Build Status](https://travis-ci.org/bxt/wptemplates.png?branch=master)](https://travis-ci.org/bxt/wptemplates)
4
+
5
+ Gem for collecting template information from MediaWiki markup.
6
+
7
+ It will help you to extract useful machine-readable data from
8
+ Wikipedia articles, since there is a lot of useful stuff
9
+ encoded as templates.
10
+
11
+ Currently only templates and links are parsed, all other markup is ignored.
12
+
13
+ ## Installation
14
+
15
+ Add this line to your application's Gemfile:
16
+
17
+ gem 'wptemplates', git: 'git://github.com/bxt/wptemplates.git'
18
+
19
+ And then execute:
20
+
21
+ $ bundle
22
+
23
+ The gem is currently not in the rubygems.org repository.
24
+
25
+ ## Usage
26
+
27
+ To parse a piece of markup simply call:
28
+
29
+ <!-- EXAMPLES:INIT -->
30
+ ast = Wptemplates.parse("{{foo | bar | x = 3 }} baz [[bam (2003)|]]y")
31
+
32
+ <!-- /EXAMPLES -->
33
+
34
+ You will get an instance of Wptemplates::Soup which is an array of
35
+ Wptemplates::Template, Wptemplates::Link and Wptemplates::Text.
36
+ You can explore the AST with these methods:
37
+
38
+ <!-- EXAMPLES:intro -->
39
+ ast.templates.is_a?(Array) && ast.templates.length # => 1
40
+ ast.text # => " baz bamy"
41
+ ast[0].name # => :foo
42
+ ast[0].params[0].text # => " bar "
43
+ ast[0].params[:x].text # => "3"
44
+ ast.all_templates_of(:foo).map{|t| t.params[:x].text} # => ["3"]
45
+ <!-- /EXAMPLES -->
46
+
47
+ You can access the links via:
48
+
49
+ <!-- EXAMPLES:links -->
50
+ ast.links.length # => 1
51
+ ast.links[0].text # => "bamy"
52
+ ast.all_links.map{|l| l.link} # => ["Bam (2003)"]
53
+ <!-- /EXAMPLES -->
54
+
55
+ ## Developing
56
+
57
+ Here's some useful info if you want to improve/customize this gem.
58
+
59
+ ### Getting Started
60
+
61
+ Check out the project, run `bundle` and then `rake` to see if the tests
62
+ pass. Run `rake -T` to see the rake tasks.
63
+
64
+ ### Markup
65
+
66
+ MediaWiki markup is not trivial to parse and there might always
67
+ be compatibility issues. There's a useful help page about
68
+ [templates][tmplh] and a [markup spec][mspec]. For links there
69
+ is a page about [links][linkh] and about the [pipe trick][ptrkh].
70
+ Also, there is a page with [link's BNF][lnbnf].
71
+
72
+ ### Known Issues
73
+
74
+ * If you have images in your templates, the pipes inside the image markup are treated as parameter separators
75
+ * Namespaced links are not recognized
76
+ * Templates in links are not recognized
77
+ * Link contents are not HTML-decoded
78
+ * nowiki, pre and math blocks might cause problems
79
+
80
+ ## Contributing
81
+
82
+ 1. Fork it
83
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
84
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
85
+ 4. Push to the branch (`git push origin my-new-feature`)
86
+ 5. Create new Pull Request
87
+
88
+ [tmplh]: http://en.wikipedia.org/wiki/Help:Template#Usage_syntax "English Wikipedia Template help page, syntax section"
89
+ [mspec]: http://www.mediawiki.org/wiki/Markup_spec "MediaWiki Markup spec"
90
+ [linkh]: http://en.wikipedia.org/wiki/Help:Link "English Wikipedia Link help page"
91
+ [ptrkh]: http://en.wikipedia.org/wiki/Help:Pipe_trick "English Wikipedia Pipe trick help page"
92
+ [lnbnf]: http://www.mediawiki.org/wiki/Markup_spec/BNF/Links "MediaWiki Link BNF"
data/README.rdoc ADDED
@@ -0,0 +1,8 @@
1
+ = Wptemplates
2
+
3
+ {<img src="https://travis-ci.org/bxt/wptemplates.png?branch=master" alt="Build Status" />}[https://travis-ci.org/bxt/wptemplates]
4
+
5
+ The public entry method for this gem is Wptemplates.parse
6
+ It will give you Wptemplates::Soup, Wptemplates::Link,
7
+ Wptemplates::Template and Wptemplates::Text elements,
8
+ which you can traverse and filter.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+
2
# Load every task definition found under tasks/. The list is sorted so the
# load order is the same on every platform and Ruby version (Dir[] only
# guarantees sorted results on newer Rubies).
Dir['tasks/**/*.rake'].sort.each { |rake| load rake }

# Running plain `rake` runs the :spec task.
task :default => :spec
@@ -0,0 +1,20 @@
1
+ require "wptemplates/version"
2
+ require "wptemplates/parser"
3
+ require "wptemplates/preprocessor"
4
+
5
# Public entry point of the gem.
module Wptemplates

  # Parses a piece of markup.
  #
  # The text is run through the preprocessor first and the preprocessed
  # result is handed to the parser exactly once. Parsing the raw text as
  # well would only duplicate the work and could fail on markup that the
  # preprocessor is supposed to remove.
  #
  # @param text [String] the markup to parse
  # @return the value returned by Parser#parse for the preprocessed text
  def self.parse(text)
    parser.parse(preprocessor.preprocess(text))
  end

  # @return [Parser] the lazily created, memoized parser instance
  def self.parser
    @parser ||= Parser.new
  end

  # @return [Preprocessor] the lazily created, memoized preprocessor instance
  def self.preprocessor
    @preprocessor ||= Preprocessor.new
  end

end
@@ -0,0 +1,91 @@
1
module Wptemplates

  # Query interface shared by every AST element. The defaults describe an
  # element that contains no templates, no links and no text; the concrete
  # classes override whatever applies to them.
  module Node
    # @return [Array] templates directly represented by this node
    def templates
      []
    end

    # @return [Array] templates of this node including nested ones
    def all_templates
      templates
    end

    # @return [String] the plain text of this node
    def text
      ""
    end

    # @param type the template name to look for (compared with ==)
    # @return [Array] direct templates whose name equals +type+
    def templates_of(type)
      templates.select { |t| t.name == type }
    end

    # @return the first direct template named +type+, or nil
    def template_of(type)
      templates_of(type).first
    end

    # @return [Array] templates at any depth whose name equals +type+
    def all_templates_of(type)
      all_templates.select { |t| t.name == type }
    end

    # @return the first template named +type+ at any depth, or nil
    def deep_template_of(type)
      all_templates_of(type).first
    end

    # @return [Array] links directly represented by this node
    def links
      []
    end

    # @return [Array] links of this node including nested ones
    def all_links
      links
    end
  end

  # An ordered list of nodes; every query is answered by combining the
  # answers of the contained elements.
  class Soup < Array
    include Node

    def templates
      flat_map(&:templates)
    end

    def links
      flat_map(&:links)
    end

    def all_templates
      flat_map(&:all_templates)
    end

    def all_links
      flat_map(&:all_links)
    end

    # @return [String] the texts of all elements joined together
    def text
      map(&:text).join('')
    end
  end

  # A template invocation with its name and a hash of parameter values.
  class Template
    include Node
    attr_reader :name, :params

    # @param name the template name
    # @param params [Hash] parameter key => value node (each value must
    #   answer the Node query methods)
    def initialize(name, params = {})
      @name = name
      @params = params
    end

    def templates
      [self]
    end

    # This template followed by every template found at any depth inside its
    # parameter values. Recursing with +all_templates+ (rather than
    # +templates+) is what makes templates nested more than one level deep
    # visible, mirroring how +all_links+ descends.
    def all_templates
      templates + @params.flat_map { |_, value| value.all_templates }
    end

    # Links found at any depth inside the parameter values.
    def all_links
      links + @params.flat_map { |_, value| value.all_links }
    end
  end

  # A run of plain text.
  class Text
    include Node
    attr_reader :text

    def initialize(text)
      @text = text
    end
  end

  # A link with its display text, target and optional anchor.
  class Link
    include Node
    attr_reader :text, :link, :anchor

    def initialize(text, link, anchor)
      @text = text
      @link = link
      @anchor = anchor
    end

    def links
      [self]
    end
  end

end
@@ -0,0 +1,112 @@
1
+ require 'strscan'
2
+ require 'wptemplates/regexes'
3
+ require 'wptemplates/utils'
4
+ require 'wptemplates/ast'
5
+
6
module Wptemplates
  # Scanner-based parser that turns markup into a Soup of Text, Template and
  # Link nodes. Patterns come from Regexes; symbolize, fixpoint and the
  # normalize_* helpers come from Utils.
  class Parser
    include Regexes
    include Utils

    # Parses +text+ from the beginning.
    #
    # @param text [String] markup to parse
    # @return [Soup] the parsed nodes (never empty)
    def parse(text)
      @input = StringScanner.new(text)
      parse_main
    end

    protected

    # Collects nodes until none of the sub-parsers can consume more input.
    # An otherwise empty result gets a single empty Text node.
    def parse_main(in_template_parameter = false)
      soup = Soup.new

      while node = parse_link || parse_template || parse_anything(in_template_parameter)
        soup << node
      end

      soup << Text.new("") if soup.empty?
      soup
    end

    # Parses "{{name|params...}}"; returns nil when the input does not start
    # with "{{" and raises when the closing "}}" is missing.
    def parse_template
      return unless @input.scan(a_doubleopenbrace)

      name = parse_template_name
      params = parse_template_parameters
      @input.scan(a_doubleclosingbrace) or raise "unclosed template"
      Template.new(name, params)
    end

    # Consumes plain text. Inside a template parameter the text additionally
    # stops at a pipe or at "}}".
    def parse_anything(in_template_parameter = false)
      pattern =
        if in_template_parameter
          till_doublebrace_doubleopenbrackets_or_pipe
        else
          till_doubleopenbrace_or_doubleopenbrackets
        end
      @input.scan(pattern) && Text.new(@input.matched)
    end

    # Returns the symbolized template name, or nil if nothing matched.
    def parse_template_name
      return unless @input.scan(till_doubleclosebrace_or_pipe)

      symbolize(@input.matched)
    end

    # Reads parameters until neither a named nor a positional one follows.
    # The position counter advances for named parameters as well, so a
    # positional parameter is keyed by its overall position in the list.
    def parse_template_parameters
      position = 0
      params = {}
      position += 1 while parsed_named_template_parameter(params) ||
                          parse_numeric_template_parameter(params, position)
      params
    end

    # Parses "|key=value" into params[key]. The value's outer whitespace is
    # stripped in place, and an outer node whose text became empty is dropped
    # as long as at least one node remains.
    def parsed_named_template_parameter(h)
      return unless @input.scan(from_pipe_till_equals_no_doubleclosebrace_or_pipe)

      key = symbolize(@input[1])
      value = parse_main(true)
      [[:lstrip!, 0], [:rstrip!, -1]].each do |strip, pos|
        value[pos].text.send(strip)
        value.delete_at(pos) if value.length > 1 && value[pos].text.empty?
      end
      h[key] = value
    end

    # Parses "|value" into h[i]; the value is stored unstripped.
    def parse_numeric_template_parameter(h, i)
      return unless @input.scan(a_pipe)

      h[i] = parse_main(true)
    end

    # Parses "[[target]]", "[[target|label]]" or "[[target|]]" plus any
    # letters directly following the closing brackets.
    def parse_link
      return unless @input.scan(a_link)

      url, label, letters = @input[1], @input[2], @input[3]
      # An empty label (pipe present, nothing after it) requests the pipe trick.
      return pipe_trick(url, label, letters) if label == ""

      link_new_with_normalize((label || url) + letters, url[until_hash], url[after_hash])
    end

    # Derives the label for "[[target|]]" from the target itself.
    def pipe_trick(url, label, letters)
      # NOTE(review): the link markup has already been consumed by the
      # scanner here, so a target containing "#" yields no node at all -
      # confirm this is intended.
      return nil if url["#"]

      with_parens = has_parens.match(url)
      if with_parens
        return link_new_with_normalize(with_parens[:no_parens] + letters, url, nil)
      end

      shortened = fixpoint(clone: true, start: url) do |u|
        u[first_comma, :before][parens, :before]
      end
      link_new_with_normalize(shortened + letters, url, nil)
    end

    # Builds a Link from normalized text, target and (if given) anchor.
    def link_new_with_normalize(text, link, anchor)
      text = normalize_linklabel(text)
      link = normalize_link(link)
      anchor = normalize_link(anchor, true) unless anchor.nil?
      Link.new(text, link, anchor)
    end

  end
end
@@ -0,0 +1,17 @@
1
+
2
module Wptemplates
  # Cleans up raw markup before it is handed to the parser.
  class Preprocessor

    # Returns a copy of +text+ with all HTML comments removed.
    #
    # The argument itself is left untouched, so frozen strings are accepted
    # and the caller's string is not modified behind its back.
    #
    # @param text [String] raw markup
    # @return [String] the markup without HTML comments
    def preprocess(text)
      strip_html_comments(text)
    end

    protected

    # Removes every "<!-- ... -->" span; the /m flag lets a comment span
    # several lines and the lazy quantifier stops at the first "-->".
    def strip_html_comments(text)
      text.gsub(/<!--.*?-->/m, '')
    end

  end
end
@@ -0,0 +1,95 @@
1
+ # encoding: UTF-8
2
+
3
module Wptemplates
  # The regular expressions the parser scans with. Every method returns a
  # Regexp and can be called on the module itself or, after including the
  # module, as a private instance method.
  module Regexes
    module_function

    # Text inside a template parameter: runs up to "{{", "}}", "[[" or "|".
    def till_doublebrace_doubleopenbrackets_or_pipe
      /(
        [^{}\[|] # Unproblematic chars
        | { (?!{ ) # A lone open brace
        | } (?!} ) # A lone close brace
        | \[(?!\[) # A lone open bracket
        | ^\[\[ # Doubleopenbrackets at start
      )+/x
    end

    # Top-level text: runs up to "{{" or "[[".
    def till_doubleopenbrace_or_doubleopenbrackets
      /(
        [^{\[] # Unproblematic chars
        | { (?!{ ) # A lone open brace
        | \[(?!\[) # A lone open bracket
        | ^\[\[ # Doubleopenbrackets at start
      )+/x
    end

    # A template name: runs up to "}}" or "|".
    def till_doubleclosebrace_or_pipe
      /(
        [^|}] # Unproblematic chars
        | } (?!} ) # A lone close brace
      )+/x
    end

    # The "|key=" prefix of a named parameter; group 1 is the raw key.
    def from_pipe_till_equals_no_doubleclosebrace_or_pipe
      /
        \| # Pipe
        ((
          [^|=}] # Unproblematic chars
          |}(?!}) # A lone close brace
        )*)
        = # Equals
      /x
    end

    # A single parameter separator.
    def a_pipe
      /\|/
    end

    # Start of a template.
    def a_doubleopenbrace
      /{{/
    end

    # End of a template.
    def a_doubleclosingbrace
      /}}/
    end

    # A complete link. Captures, in order: the target (including an optional
    # "#anchor"), the optional description after the pipe, and the letters
    # directly following the closing brackets.
    def a_link
      /
        \[\[
        (?<link>
          # ([% title-legal-chars])+
          [%\ !"$&'()*,\-.\/0-9:;=?@A-Z\\^_`a-z~\u0080-\uFFFF+]+
          # ("#" [# % title-legal-chars]+)?
          ( \# [\#%\ !"$&'()*,\-.\/0-9:;=?@A-Z\\^_`a-z~\u0080-\uFFFF+]+ )?
        )
        (
          # "|" LEGAL_ARTICLE_ENTITY*
          \| (?<link-description>([^\]]|\](?!\]))*)
        )?
        \]\]
        (?<extra_letters>\p{L}*)
      /x
    end

    # Everything before the first "#".
    def until_hash
      /[^#]*/
    end

    # Everything after the first "#"; no match when there is none.
    def after_hash
      /(?<=#).*/
    end

    # A title ending in a parenthesized part; :no_parens is the text before it.
    def has_parens
      /^(?<no_parens>.*?) *\(.*\) *$/
    end

    # :before is the text up to the first ", " (or the whole string).
    def first_comma
      /^(?<before>([^,]|,(?! ))*)(, |$)/
    end

    # :before is the text preceding an optional trailing parenthesized part.
    def parens
      /^(?<before>.*?)(\(.*\) *)?$/
    end

  end
end