wp2txt 0.6.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/benchmark.rb +69 -0
- data/bin/wp2txt +1 -1
- data/lib/wp2txt/version.rb +1 -1
- data/wp2txt.gemspec +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bcfd6986e262e455c100d664583b099d57ff4428
|
4
|
+
data.tar.gz: 68c922b43951b7f326b0136681981a166208d0a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 294e0f8e1d2b37534ad885c617cfbd72ad72144dca6fb01231f6e2cf691a86bf58690f5dd1b2b410f8ee23eb3c74fa3f40e4ca8bbf3f3921ea78295783da5f2e
|
7
|
+
data.tar.gz: 71a1b8feca5c3067ff534f0239c4c937485ff3ed8c0a6de793be0b71befd440cb5b1c2c465dc6684c558a53209ab6481dd2e60edfe7fb7cbdd9c6f07416efd24
|
data/bin/benchmark.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
$: << File.join(File.dirname(__FILE__))
|
5
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
+
|
7
|
+
require 'wp2txt'
|
8
|
+
require 'wp2txt/utils'
|
9
|
+
include Wp2txt
|
10
|
+
require 'benchmark'
|
11
|
+
|
12
|
+
data_dir = File.join(File.dirname(__FILE__), '..', "data")
|
13
|
+
|
14
|
+
parent = Wp2txt::CmdProgbar.new
|
15
|
+
input_file = File.join(data_dir, "testdata.bz2")
|
16
|
+
output_dir = data_dir
|
17
|
+
tfile_size = 10
|
18
|
+
convert = true
|
19
|
+
strip_tmarker = true
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
Benchmark.bm do |x|
|
24
|
+
x.report do
|
25
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
26
|
+
wpconv.extract_text do |article|
|
27
|
+
title = format_wiki article.title
|
28
|
+
title = "[[#{title}]]\n"
|
29
|
+
|
30
|
+
contents = "\nCATEGORIES: "
|
31
|
+
contents += article.categories.join(", ")
|
32
|
+
contents += "\n\n"
|
33
|
+
|
34
|
+
article.elements.each do |e|
|
35
|
+
case e.first
|
36
|
+
when :mw_heading
|
37
|
+
line = format_wiki(e.last)
|
38
|
+
when :mw_paragraph
|
39
|
+
line = format_wiki(e.last)
|
40
|
+
when :mw_table, :mw_htable
|
41
|
+
line = format_wiki(e.last)
|
42
|
+
when :mw_pre
|
43
|
+
line = e.last
|
44
|
+
when :mw_quote
|
45
|
+
line = format_wiki(e.last)
|
46
|
+
when :mw_unordered, :mw_ordered, :mw_definition
|
47
|
+
line = format_wiki(e.last)
|
48
|
+
when :mw_redirect
|
49
|
+
line = format_wiki(e.last)
|
50
|
+
line += "\n\n"
|
51
|
+
else
|
52
|
+
next
|
53
|
+
end
|
54
|
+
contents += line
|
55
|
+
contents = remove_templates(contents)
|
56
|
+
end
|
57
|
+
|
58
|
+
##### cleanup #####
|
59
|
+
if /\A\s*\z/m =~ contents
|
60
|
+
result = ""
|
61
|
+
else
|
62
|
+
result = title + "\n" + contents
|
63
|
+
end
|
64
|
+
result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
|
65
|
+
result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
data/bin/wp2txt
CHANGED
@@ -45,7 +45,7 @@ output_dir = opts[:output_dir]
|
|
45
45
|
tfile_size = opts[:file_size]
|
46
46
|
convert = opts[:convert]
|
47
47
|
strip_tmarker = opts[:marker] ? false : true
|
48
|
-
opt_array = [:
|
48
|
+
opt_array = [:title, :list, :heading, :table, :template, :redirect]
|
49
49
|
config = {}
|
50
50
|
opt_array.each do |opt|
|
51
51
|
config[opt] = opts[opt]
|
data/lib/wp2txt/version.rb
CHANGED
data/wp2txt.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -99,6 +99,7 @@ description: WP2TXT extracts plain text data from Wikipedia dump file (encoded i
|
|
99
99
|
email:
|
100
100
|
- yohasebe@gmail.com
|
101
101
|
executables:
|
102
|
+
- benchmark.rb
|
102
103
|
- wp2txt
|
103
104
|
extensions: []
|
104
105
|
extra_rdoc_files: []
|
@@ -108,6 +109,7 @@ files:
|
|
108
109
|
- LICENSE
|
109
110
|
- README.md
|
110
111
|
- Rakefile
|
112
|
+
- bin/benchmark.rb
|
111
113
|
- bin/wp2txt
|
112
114
|
- data/testdata.bz2
|
113
115
|
- lib/wp2txt.rb
|
@@ -138,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
138
140
|
version: '0'
|
139
141
|
requirements: []
|
140
142
|
rubyforge_project: wp2txt
|
141
|
-
rubygems_version: 2.4.
|
143
|
+
rubygems_version: 2.4.2
|
142
144
|
signing_key:
|
143
145
|
specification_version: 4
|
144
146
|
summary: Wikipedia dump to text converter
|