wp2txt 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/benchmark.rb +69 -0
- data/bin/wp2txt +1 -1
- data/lib/wp2txt/version.rb +1 -1
- data/wp2txt.gemspec +1 -1
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bcfd6986e262e455c100d664583b099d57ff4428
|
4
|
+
data.tar.gz: 68c922b43951b7f326b0136681981a166208d0a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 294e0f8e1d2b37534ad885c617cfbd72ad72144dca6fb01231f6e2cf691a86bf58690f5dd1b2b410f8ee23eb3c74fa3f40e4ca8bbf3f3921ea78295783da5f2e
|
7
|
+
data.tar.gz: 71a1b8feca5c3067ff534f0239c4c937485ff3ed8c0a6de793be0b71befd440cb5b1c2c465dc6684c558a53209ab6481dd2e60edfe7fb7cbdd9c6f07416efd24
|
data/bin/benchmark.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
$: << File.join(File.dirname(__FILE__))
|
5
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
+
|
7
|
+
require 'wp2txt'
|
8
|
+
require 'wp2txt/utils'
|
9
|
+
include Wp2txt
|
10
|
+
require 'benchmark'
|
11
|
+
|
12
|
+
data_dir = File.join(File.dirname(__FILE__), '..', "data")
|
13
|
+
|
14
|
+
parent = Wp2txt::CmdProgbar.new
|
15
|
+
input_file = File.join(data_dir, "testdata.bz2")
|
16
|
+
output_dir = data_dir
|
17
|
+
tfile_size = 10
|
18
|
+
convert = true
|
19
|
+
strip_tmarker = true
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
Benchmark.bm do |x|
|
24
|
+
x.report do
|
25
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
26
|
+
wpconv.extract_text do |article|
|
27
|
+
title = format_wiki article.title
|
28
|
+
title = "[[#{title}]]\n"
|
29
|
+
|
30
|
+
contents = "\nCATEGORIES: "
|
31
|
+
contents += article.categories.join(", ")
|
32
|
+
contents += "\n\n"
|
33
|
+
|
34
|
+
article.elements.each do |e|
|
35
|
+
case e.first
|
36
|
+
when :mw_heading
|
37
|
+
line = format_wiki(e.last)
|
38
|
+
when :mw_paragraph
|
39
|
+
line = format_wiki(e.last)
|
40
|
+
when :mw_table, :mw_htable
|
41
|
+
line = format_wiki(e.last)
|
42
|
+
when :mw_pre
|
43
|
+
line = e.last
|
44
|
+
when :mw_quote
|
45
|
+
line = format_wiki(e.last)
|
46
|
+
when :mw_unordered, :mw_ordered, :mw_definition
|
47
|
+
line = format_wiki(e.last)
|
48
|
+
when :mw_redirect
|
49
|
+
line = format_wiki(e.last)
|
50
|
+
line += "\n\n"
|
51
|
+
else
|
52
|
+
next
|
53
|
+
end
|
54
|
+
contents += line
|
55
|
+
contents = remove_templates(contents)
|
56
|
+
end
|
57
|
+
|
58
|
+
##### cleanup #####
|
59
|
+
if /\A\s*\z/m =~ contents
|
60
|
+
result = ""
|
61
|
+
else
|
62
|
+
result = title + "\n" + contents
|
63
|
+
end
|
64
|
+
result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
|
65
|
+
result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
data/bin/wp2txt
CHANGED
@@ -45,7 +45,7 @@ output_dir = opts[:output_dir]
|
|
45
45
|
tfile_size = opts[:file_size]
|
46
46
|
convert = opts[:convert]
|
47
47
|
strip_tmarker = opts[:marker] ? false : true
|
48
|
-
opt_array = [:
|
48
|
+
opt_array = [:title, :list, :heading, :table, :template, :redirect]
|
49
49
|
config = {}
|
50
50
|
opt_array.each do |opt|
|
51
51
|
config[opt] = opts[opt]
|
data/lib/wp2txt/version.rb
CHANGED
data/wp2txt.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -99,6 +99,7 @@ description: WP2TXT extracts plain text data from Wikipedia dump file (encoded i
|
|
99
99
|
email:
|
100
100
|
- yohasebe@gmail.com
|
101
101
|
executables:
|
102
|
+
- benchmark.rb
|
102
103
|
- wp2txt
|
103
104
|
extensions: []
|
104
105
|
extra_rdoc_files: []
|
@@ -108,6 +109,7 @@ files:
|
|
108
109
|
- LICENSE
|
109
110
|
- README.md
|
110
111
|
- Rakefile
|
112
|
+
- bin/benchmark.rb
|
111
113
|
- bin/wp2txt
|
112
114
|
- data/testdata.bz2
|
113
115
|
- lib/wp2txt.rb
|
@@ -138,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
138
140
|
version: '0'
|
139
141
|
requirements: []
|
140
142
|
rubyforge_project: wp2txt
|
141
|
-
rubygems_version: 2.4.
|
143
|
+
rubygems_version: 2.4.2
|
142
144
|
signing_key:
|
143
145
|
specification_version: 4
|
144
146
|
summary: Wikipedia dump to text converter
|