infoboxer 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/lib/infoboxer/media_wiki/traits.rb +2 -1
- data/lib/infoboxer/parser/inline.rb +16 -5
- data/lib/infoboxer/parser/table.rb +23 -19
- data/lib/infoboxer/parser/util.rb +12 -2
- data/lib/infoboxer/tree/table.rb +2 -2
- data/lib/infoboxer/version.rb +1 -1
- data/regression/pages/greece.wiki +919 -0
- data/regression/pages/south_america_new.wiki +652 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69ee012f6cd8bb3e923289a4a921e81798ffd2aa
|
4
|
+
data.tar.gz: 42686bced509c310051ba7bc895c68d26a96bd2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cc237b46fe0f59a2ba04d3b53da4a7361cd974e50e1d18f688a077093da664a7d205dd363ddc2622a9af4c1a6f1168315d90fb6867e65c0a61bd60e5e453e81a
|
7
|
+
data.tar.gz: d844446421cffd8e7f6a8fe885f0f1918c772f9e2e7c1d289a1e90b544fde9b3c2b41c8956df7338aab9ad2ae5ca7f5ec9d67fb2d3f16da34367ed5b96a5aa22
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,13 @@
|
|
1
1
|
# Infoboxer's change log
|
2
2
|
|
3
|
+
## 0.2.2 (2016-01-03)
|
4
|
+
|
5
|
+
Fixes:
|
6
|
+
* more sophisticated table parsing;
|
7
|
+
* empty `<nowiki/>` is parsed properly;
|
8
|
+
* inline unclosed markup inside wikilinks works;
|
9
|
+
* `MediaWiki::Traits` can now be continued in several places.
|
10
|
+
|
3
11
|
## 0.2.1 (2015-12-21)
|
4
12
|
|
5
13
|
* `infoboxer` binary properly registered.
|
data/README.md
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
[![Build Status](https://travis-ci.org/molybdenum-99/infoboxer.svg?branch=master)](https://travis-ci.org/molybdenum-99/infoboxer)
|
5
5
|
[![Coverage Status](https://coveralls.io/repos/molybdenum-99/infoboxer/badge.svg?branch=master&service=github)](https://coveralls.io/github/molybdenum-99/infoboxer?branch=master)
|
6
6
|
[![Code Climate](https://codeclimate.com/github/molybdenum-99/infoboxer/badges/gpa.svg)](https://codeclimate.com/github/molybdenum-99/infoboxer)
|
7
|
-
[![
|
7
|
+
[![Infoboxer Gitter](https://badges.gitter.im/molybdenum-99/infoboxer.svg)](https://gitter.im/molybdenum-99/infoboxer)
|
8
8
|
|
9
9
|
**Infoboxer** is a pure-Ruby Wikipedia (and generic MediaWiki) client and
|
10
10
|
parser, targeting information extraction (hence the name).
|
@@ -59,7 +59,8 @@ module Infoboxer
|
|
59
59
|
# [English Wikipedia traits](https://github.com/molybdenum-99/infoboxer/blob/master/lib/infoboxer/definitions/en.wikipedia.org.rb)
|
60
60
|
# for example implementation.
|
61
61
|
def for(domain, &block)
|
62
|
-
|
62
|
+
Traits.domains[domain].tap{|c| c && c.instance_eval(&block)} ||
|
63
|
+
Class.new(self, &block).domain(domain)
|
63
64
|
end
|
64
65
|
|
65
66
|
# @private
|
@@ -33,8 +33,10 @@ module Infoboxer
|
|
33
33
|
nodes = Nodes[]
|
34
34
|
guarded_loop do
|
35
35
|
# FIXME: quick and UGLY AS HELL JUST TRYING TO MAKE THE SHIT WORK
|
36
|
-
if @context.inline_eol_sign
|
36
|
+
if @context.inline_eol_sign == /^\]/
|
37
37
|
chunk = @context.scan_until(re.short_inline_until_cache_brackets[until_pattern])
|
38
|
+
elsif @context.inline_eol_sign == /^\]\]/
|
39
|
+
chunk = @context.scan_until(re.short_inline_until_cache_brackets2[until_pattern])
|
38
40
|
else
|
39
41
|
chunk = @context.scan_until(re.short_inline_until_cache[until_pattern])
|
40
42
|
end
|
@@ -97,7 +99,7 @@ module Infoboxer
|
|
97
99
|
when '{{'
|
98
100
|
template
|
99
101
|
when /<nowiki([^>]*)>/
|
100
|
-
nowiki
|
102
|
+
nowiki($1)
|
101
103
|
when /<ref([^>]*)\/>/
|
102
104
|
reference($1, true)
|
103
105
|
when /<ref([^>]*)>/
|
@@ -114,7 +116,12 @@ module Infoboxer
|
|
114
116
|
# [[a|b]]
|
115
117
|
def wikilink
|
116
118
|
link = @context.scan_continued_until(/\||\]\]/)
|
117
|
-
|
119
|
+
if @context.matched == '|'
|
120
|
+
@context.push_eol_sign(/^\]\]/)
|
121
|
+
caption = inline(/\]\]/)
|
122
|
+
@context.pop_eol_sign
|
123
|
+
end
|
124
|
+
|
118
125
|
Wikilink.new(link, caption)
|
119
126
|
end
|
120
127
|
|
@@ -136,8 +143,12 @@ module Infoboxer
|
|
136
143
|
Ref.new(children, parse_params(param_str))
|
137
144
|
end
|
138
145
|
|
139
|
-
def nowiki
|
140
|
-
|
146
|
+
def nowiki(tag_rest)
|
147
|
+
if tag_rest.end_with?('/')
|
148
|
+
Text.new('')
|
149
|
+
else
|
150
|
+
Text.new(@context.scan_continued_until(/<\/nowiki>/))
|
151
|
+
end
|
141
152
|
end
|
142
153
|
end
|
143
154
|
|
@@ -52,10 +52,10 @@ module Infoboxer
|
|
52
52
|
table_template(table)
|
53
53
|
|
54
54
|
when nil
|
55
|
-
|
55
|
+
return false
|
56
56
|
|
57
57
|
else
|
58
|
-
table_cell_cont(table)
|
58
|
+
return table_cell_cont(table)
|
59
59
|
end
|
60
60
|
true # should continue parsing
|
61
61
|
end
|
@@ -103,29 +103,33 @@ module Infoboxer
|
|
103
103
|
end
|
104
104
|
end
|
105
105
|
|
106
|
-
#
|
107
|
-
#
|
108
|
-
# {|
|
109
|
-
# <caption>....</caption>
|
110
|
-
#
|
111
|
-
# Solution is NOT elegant or semantically "right", yet it works.
|
112
|
-
# Somehow.
|
106
|
+
# Good news, everyone! Table can be IMPLICITLY closed when it's
|
107
|
+
# not in a "cell" context.
|
113
108
|
#
|
109
|
+
# Unless it's an empty row, which is just skipped.
|
114
110
|
def table_cell_cont(table)
|
115
111
|
container = case (last = table.children.last)
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
112
|
+
when TableRow
|
113
|
+
last.children.last
|
114
|
+
when TableCaption
|
115
|
+
last
|
116
|
+
else
|
117
|
+
nil
|
118
|
+
end
|
119
|
+
|
120
|
+
if !container
|
121
|
+
# return "table not continued" unless row is empty
|
122
|
+
if @context.current.empty?
|
123
|
+
return true
|
124
|
+
else
|
125
|
+
@context.prev!
|
126
|
+
return false
|
127
|
+
end
|
125
128
|
end
|
126
|
-
|
129
|
+
|
127
130
|
container.push_children(paragraph(/^\s*([|!]|{\|)/))
|
128
131
|
table.push_children(container) unless container.parent
|
132
|
+
true
|
129
133
|
end
|
130
134
|
end
|
131
135
|
end
|
@@ -19,13 +19,20 @@ module Infoboxer
|
|
19
19
|
}}
|
20
20
|
)]x
|
21
21
|
|
22
|
-
|
22
|
+
INLINE_EOL_BRACK = %r[(?= # if we have ahead... (not scanned, just checked
|
23
23
|
</ref> | # <ref> closed
|
24
24
|
}} | # or template closed
|
25
25
|
(?<!\])\](?!\]) # or ext.link closed,
|
26
26
|
# the madness with look-ahead/behind means "match single bracket but not double"
|
27
27
|
)]x
|
28
28
|
|
29
|
+
# FIXME: ok, NOW it's officially ridiculous
|
30
|
+
INLINE_EOL_BRACK2 = %r[(?= # if we have ahead... (not scanned, just checked
|
31
|
+
</ref> | # <ref> closed
|
32
|
+
}} | # or template closed
|
33
|
+
\]\] # or int.link closed
|
34
|
+
)]x
|
35
|
+
|
29
36
|
|
30
37
|
def make_regexps
|
31
38
|
{
|
@@ -38,7 +45,10 @@ module Infoboxer
|
|
38
45
|
h[r] = Regexp.union(*[r, INLINE_EOL, FORMATTING, /$/].compact.uniq)
|
39
46
|
},
|
40
47
|
short_inline_until_cache_brackets: Hash.new{|h, r|
|
41
|
-
h[r] = Regexp.union(*[r,
|
48
|
+
h[r] = Regexp.union(*[r, INLINE_EOL_BRACK, FORMATTING, /$/].compact.uniq)
|
49
|
+
},
|
50
|
+
short_inline_until_cache_brackets2: Hash.new{|h, r|
|
51
|
+
h[r] = Regexp.union(*[r, INLINE_EOL_BRACK2, FORMATTING, /$/].compact.uniq)
|
42
52
|
}
|
43
53
|
|
44
54
|
}
|
data/lib/infoboxer/tree/table.rb
CHANGED
@@ -25,13 +25,13 @@ module Infoboxer
|
|
25
25
|
#
|
26
26
|
# FIXME: there can easily be several table heading rows
|
27
27
|
def heading_row
|
28
|
-
rows.first.children.all?(&call(matches?: TableHeading)) ?
|
28
|
+
rows.first && rows.first.children.all?(&call(matches?: TableHeading)) ?
|
29
29
|
rows.first : nil
|
30
30
|
end
|
31
31
|
|
32
32
|
# For now, returns all table rows except {#heading_row}
|
33
33
|
def body_rows
|
34
|
-
rows.first.children.all?(&call(matches?: TableHeading)) ?
|
34
|
+
rows.first && rows.first.children.all?(&call(matches?: TableHeading)) ?
|
35
35
|
rows[1..-1] :
|
36
36
|
rows
|
37
37
|
end
|